學習筆記:使用k-近鄰演算法改進約會網站的配對效果
阿新 • • 發佈:2019-01-06
# -*- coding: UTF-8 -*-
"""k-nearest-neighbours classifier applied to a dating-site data set.

The data file is tab-separated text: the first three fields of each line are
numeric features and the last field is an integer class label.  Judging by
how ``classifyPerson`` assembles its query vector, the feature order is
frequent-flier miles, percentage of time spent playing video games, litres
of ice cream.

The code runs unchanged on Python 2 and Python 3.
"""
import operator

import numpy as np

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# Default location of the training data used by the demo entry points.
DATING_DATA_FILE = "/Users/ZZ/Desktop/MY_FILE/MACHINE_LEARNING_IN_ACTION/machinelearninginaction/Ch02/datingTestSet2.txt"


def file2matrix(filename):
    """Load the tab-separated data file into a feature matrix and labels.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three fields of every
        non-blank line and ``classLabelVector`` is a list with the last
        field of each of those lines converted to ``int``.

    Raises:
        ValueError: if a field cannot be converted to a number.
    """
    # The context manager closes the file even if parsing fails; blank lines
    # (for example a trailing newline at end of file) are skipped.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: feature vector to classify.
        dataSet: training samples, one row per sample.
        labels: class labels, ``labels[i]`` belonging to ``dataSet[i]``.
        k: number of neighbours that vote (expected to be at least 1).  If
            ``k`` exceeds the number of samples, every sample votes.

    Returns:
        The label with the most votes among the nearest neighbours.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    # Broadcasting subtracts inX from every row; the sign is irrelevant
    # because the differences are squared.
    diffMat = dataSet - np.asarray(inX, dtype=float)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))
    nearest = distances.argsort()[:k]

    classCount = {}
    for sampleIndex in nearest:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # dict.items() exists on both Python 2 and 3 (iteritems() does not).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def showDateSet(datingDataMat, datingLabels):
    """Show a scatter plot of the first two feature columns.

    Marker colour and size are both ``15 * label`` so the classes can be
    told apart visually.

    Args:
        datingDataMat: feature matrix as returned by ``file2matrix``.
        datingLabels: numeric class labels, one per row.
    """
    # Imported here so the classifier itself works without matplotlib or a
    # display being available.
    import matplotlib.pyplot as plt

    markerValues = 15 * np.array(datingLabels)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               c=markerValues, s=markerValues, label=u'散點圖')
    plt.legend(loc='upper left')
    # Column 0 is frequent-flier miles and column 1 the video-game
    # percentage (the order classifyPerson uses for its query vector), so
    # the x axis gets the miles label and the y axis the game-time label.
    plt.xlabel(u"每年獲取的飛行常客里程數")
    plt.ylabel(u"玩視訊遊戲所耗得時間比")
    plt.show()


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range and therefore
    yields non-finite results.

    Args:
        dataSet: 2-D array-like with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised matrix,
        the per-column ranges and the per-column minima.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # The shifted matrix (not the raw data) must be divided by the ranges;
    # dividing the raw data would drop the minimum subtraction.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest(filename=DATING_DATA_FILE, hoRatio=0.10, k=3):
    """Estimate the classifier's error rate on a hold-out slice of the data.

    The first ``int(m * hoRatio)`` rows of the normalised data are classified
    against the remaining rows; every prediction and the final error rate
    are printed.

    Args:
        filename: data file to load.
        hoRatio: fraction of the rows used as test samples.
        k: number of neighbours passed to ``classify0``.

    Returns:
        The error rate as a float (``0.0`` when the hold-out slice is empty).
    """
    datingDataMat, datingLabels = file2matrix(filename)
    norMat, ranges, minVals = autoNorm(datingDataMat)
    m = norMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(norMat[i, :], norMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d, the rael answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Guard against an empty hold-out slice (tiny data set or hoRatio of 0).
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the tatol error rate is: %f" % errorRate)
    return errorRate


def classifyPerson(filename=DATING_DATA_FILE, k=3):
    """Ask for one person's three features and print the predicted liking.

    The answers are read from standard input, normalised with the training
    data's minima and ranges, and classified with ``classify0``.

    Args:
        filename: data file used as the training set.
        k: number of neighbours passed to ``classify0``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(_read_line("percentage of time spent playing video games?"))
    ffMiles = float(_read_line("frequent flier miles earned per year?"))
    iceCream = float(_read_line("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    norMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = np.array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, norMat,
                                 datingLabels, k)
    # Labels are used as 1-based indices into resultList.
    print("You will probably like this person: %s"
          % resultList[classifierResult - 1])


if __name__ == "__main__":
    # Only prompt when executed as a script, not when imported.
    classifyPerson()