1. 程式人生 > >學習筆記:使用k-近鄰演算法改進約會網站的配對效果

學習筆記:使用k-近鄰演算法改進約會網站的配對效果

# -*- coding: UTF-8 -*-
from numpy import *
import operator
import matplotlib.pyplot as plt

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is split on tab characters. The first three fields
    are stored as floats in one matrix row and the last field is parsed as
    the integer class label of that row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``(n, 3)`` float array holding the first three columns of the ``n``
        non-blank lines, ``classLabelVector`` is the list of their ``n``
        integer labels.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # The context manager closes the file even when parsing fails below.
    with open(filename) as fr:
        # Blank lines (e.g. an empty line at the end of the file) carry no
        # sample, so they are skipped instead of crashing the conversion.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; must be broadcastable against the
            rows of ``dataSet``.
        dataSet: 2-D array of known samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is larger than the number of rows or ``k`` is 0.
    """
    # Broadcasting subtracts inX from every row without materialising a
    # tiled copy of it first.
    diffMat = inX - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistances = distances.argsort()

    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistances[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # items() is available on both Python 2 and Python 3, whereas
    # iteritems() only exists on Python 2.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def showDateSet(datingDataMat, datingLabels):
    """Show a scatter plot of the first two feature columns.

    Column 0 is drawn on the x axis and column 1 on the y axis. Both the
    colour and the size of each marker are ``15 * label``, so samples of
    different classes are visually distinct.

    Args:
        datingDataMat: 2-D array of samples with at least two columns.
        datingLabels: Sequence of numeric class labels, one per row.
    """
    markerScale = 15 * array(datingLabels)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1], c=markerScale, s=markerScale, label=u'散點圖')
    plt.legend(loc='upper left')
    # classifyPerson builds its query vector as
    # [flier miles, game-time share, ice cream], i.e. column 0 holds the
    # frequent-flier miles and column 1 the video-game time share. The axis
    # captions therefore have to name the columns in that order.
    plt.xlabel(u"每年獲取的飛行常客里程數")
    plt.ylabel(u"玩視訊遊戲所耗得時間比")
    plt.show()

def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped column-wise with
    ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D numeric array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` and
        ``minVals`` let a caller apply the same scaling to new samples.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0. Dividing it by 1.0 maps it to 0 instead
    # of producing NaN; the float literal also forces true division. The
    # returned ``ranges`` is left untouched.
    divisors = where(ranges == 0, 1.0, ranges)
    # The division must act on the min-shifted data, not on the raw data,
    # otherwise the subtraction is lost and values are not in [0, 1].
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals

def datingClassTest(filename="/Users/ZZ/Desktop/MY_FILE/MACHINE_LEARNING_IN_ACTION/machinelearninginaction/Ch02/datingTestSet2.txt", hoRatio=0.10, k=3):
    """Measure the classifier's error rate with a hold-out split.

    The data file is loaded and normalised, the first ``hoRatio`` share of
    the rows is classified against the remaining rows, and every prediction
    is printed next to the true label.

    Args:
        filename: Path of the tab-separated data file to evaluate on.
        hoRatio: Fraction of the rows (taken from the top) used as test set.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of misclassified test rows, or 0.0 when the hold-out
        set is empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    norMat = autoNorm(datingDataMat)[0]
    m = norMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0
    for i in range(numTestVecs):
        # Rows before numTestVecs are test samples; the rest is the
        # reference set the classifier votes over.
        classifierResult = classify0(norMat[i, :], norMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    # Guard against an empty hold-out set (tiny file or hoRatio of 0).
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    return errorRate

def classifyPerson(filename="/Users/ZZ/Desktop/MY_FILE/MACHINE_LEARNING_IN_ACTION/machinelearninginaction/Ch02/datingTestSet2.txt"):
    """Ask for one person's three features and print the predicted category.

    The answers are read from standard input, scaled with the minimum and
    range of the data file, and classified with the 3 nearest neighbours.

    Args:
        filename: Path of the tab-separated data file used as reference set.

    Raises:
        ValueError: If an answer cannot be parsed as a float.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned  per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    norMat, ranges, minVals = autoNorm(datingDataMat)
    # The query vector is ordered miles, game time, ice cream, which is a
    # different order from the prompts above.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query with the same minimum and range as the reference data.
    classifierResult = classify0((inArr - minVals) / ranges, norMat, datingLabels, 3)
    # NOTE(review): the index assumes labels are the integers 1..3 - confirm
    # against the data file.
    print("You will probably like this person: %s" % resultList[classifierResult - 1])

# Start the interactive demo only when the file is executed as a script, so
# importing the module does not block on user input.
if __name__ == "__main__":
    classifyPerson()