1. 程式人生 > 實用技巧 >【機器學習實戰】第六章--支援向量機

【機器學習實戰】第六章--支援向量機

  1 import numpy as np
  2 import os
  3 
  4 
  5 class optStruct:
  6     # 建立一個數據結構來儲存所有重要的值,僅包含__init__方法,該方法可以實現其成員變數的填充
  7     def __init__(self, dataMatIn, classLabels, C, toler, kTup):
  8         # kTup是一個包含核函式資訊的元組
  9         self.X = dataMatIn
 10         self.labelMat = classLabels
 11         self.C = C
12 self.tol = toler 13 self.m = np.shape(dataMatIn)[0] 14 self.alphas = np.mat(np.zeros((self.m, 1))) 15 self.b = 0 16 # eCache的第一列給出的是eCache是否有效的標誌位,第二列給出的是實際的E值 17 self.eCache = np.mat(np.zeros((self.m, 2))) 18 self.K = np.mat(np.zeros((self.m, self.m)))
19 for i in range(self.m): 20 self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup) 21 22 23 def calcEk(oS, k): 24 # 計算E值並返回 25 # fXk = float(np.multiply(oS.alphas, oS.labelMat).T*(oS.X*oS.X[k,:].T)) + oS.b 26 fXk = float(np.multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
27 Ek = fXk - float(oS.labelMat[k]) 28 return Ek 29 30 31 def selectJ(i, oS, Ei): 32 # 用於選擇第二個alpha或者說內迴圈的alpha值,這裡的目標是選擇合適的第二個alpha值以保證在每次優化中採用最大步長 33 # 該函式的誤差值與第一個alpha值Ei和下標i有關 34 maxK = -1 35 maxDeltaE = 0 36 Ej = 0 37 oS.eCache[i] = [1, Ei] 38 validEcacheList = np.nonzero(oS.eCache[:, 0].A)[0] # 構建一個非零表。返回的是非零E值所對應的alpha值 39 # np.nonzero函式是numpy中用於得到陣列array中非零元素的位置(陣列索引)的函式。 40 if len(validEcacheList) > 1: 41 for k in validEcacheList: 42 if k == i: 43 continue 44 Ek = calcEk(oS, k) 45 deltaE = abs(Ei - Ek) 46 if deltaE > maxDeltaE: 47 maxK = k 48 maxDeltaE = deltaE 49 Ej = Ek 50 # 選擇具有最大步長的j 51 return maxK, Ej 52 else: 53 j = selectJrand(i, oS.m) 54 Ej = calcEk(oS, j) 55 return j, Ej 56 57 58 def updateEk(oS, k): 59 # 計算誤差值並存入快取當中。在對alpha值進行優化之後會用到這個值 60 Ek = calcEk(oS, k) 61 oS.eCache[k] = [1, Ek] 62 63 64 def innerL(i, oS): 65 Ei = calcEk(oS, i) 66 if ((oS.labelMat[i]*Ei<-oS.tol) and (oS.alphas[i]<oS.C)) or ((oS.labelMat[i]*Ei>oS.tol) and (oS.alphas[i]>0)): 67 j, Ej = selectJ(i, oS, Ei) 68 alphaIold = oS.alphas[i].copy() 69 alphaJold = oS.alphas[j].copy() 70 if oS.labelMat[i] != oS.labelMat[j]: 71 L = max(0, oS.alphas[j] - oS.alphas[i]) 72 H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i]) 73 else: 74 L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C) 75 H = min(oS.C, oS.alphas[j] + oS.alphas[i]) 76 if L == H: 77 print('L == H') 78 return 0 79 # eta = 2.0*oS.X[i,:]*oS.X[j,:].T-oS.X[i,:]*oS.X[i,:].T-oS.X[j,:]*oS.X[j,:].T 80 eta = 2.0*oS.K[i,j]-oS.K[i,i]-oS.K[j,j] 81 if eta >= 0: 82 print('eta>=0') 83 return 0 84 oS.alphas[j] -= oS.labelMat[j]*(Ei-Ej)/eta 85 oS.alphas[j] = clipAlpha(oS.alphas[j],H,L) 86 updateEk(oS,j) 87 if abs(oS.alphas[j] - alphaJold) < 0.00001: 88 print('j not moving enough') 89 return 0 90 oS.alphas[i] += oS.labelMat[j] * oS.labelMat[i]*(alphaJold - oS.alphas[j]) 91 updateEk(oS, i) 92 # b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[i,:].T-oS.labelMat[j]*\ 93 # (oS.alphas[j]-alphaJold)*oS.X[i,:]*oS.X[j,:].T 94 # b2 = 
oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.X[i,:]*oS.X[j,:].T-oS.labelMat[j]*\ 95 # (oS.alphas[j]-alphaJold)*oS.X[j,:]*oS.X[j,:].T 96 b1 = oS.b - Ei - oS.labelMat[i] * (oS.alphas[i] - alphaIold) * oS.K[i, i] - oS.labelMat[j] * \ 97 (oS.alphas[j]-alphaJold)*oS.K[i,j] 98 b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]-oS.labelMat[j]*\ 99 (oS.alphas[j]-alphaJold)*oS.K[j,j] 100 if (0<oS.alphas[i]) and (oS.C>oS.alphas[i]): 101 oS.b = b1 102 elif (0<oS.alphas[j]) and (oS.C>oS.alphas[j]): 103 oS.b = b2 104 else: 105 oS.b = (b1+b2)/2.0 106 return 1 107 else: 108 return 0 109 110 111 def smoP(dataMatIn, classLabels, C, toler, maxIter, kTup=('lin', 0)): 112 # 例項化物件oS,用來容納所有資料 113 oS = optStruct(np.mat(dataMatIn), np.mat(classLabels).transpose(),C,toler, kTup) 114 iter = 0 115 entireSet = True 116 alphaPairsChanged = 0 117 while iter < maxIter and (alphaPairsChanged >0 or entireSet): 118 # 當迭代次數超過指定的最大值,或者遍歷整個集合都未對任意alpha對進行修改時,就退出迴圈 119 alphaPairsChanged = 0 120 if entireSet: 121 for i in range(oS.m): 122 alphaPairsChanged += innerL(i, oS) 123 print('fullSet, iter: %d i: %d, pairs changed %d'%(iter, i, alphaPairsChanged)) 124 iter += 1 125 else: 126 nonBoundsIs = np.nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0] 127 # matrix.A是將矩陣轉換為ndarray 128 # >>> type(c) 129 # <class 'numpy.matrix'> 130 # >>> type(c.A) 131 # <class 'numpy.ndarray'> 132 for i in nonBoundsIs: 133 print('non-bound,iter: %d i: %d, pairs changed %d' % (iter, i, alphaPairsChanged)) 134 iter += 1 135 if entireSet: 136 entireSet = False 137 elif alphaPairsChanged == 0: 138 entireSet = True 139 print('iteration number: %d' % iter) 140 return oS.b, oS.alphas 141 142 143 def clacWs(alphas, dataArr, classLabels): 144 X = np.mat(dataArr) 145 labelMat = np.mat(classLabels).transpose() 146 m, n = np.shape(X) 147 w = np.zeros((n, 1)) 148 for i in range(m): 149 w += np.multiply(alphas[i] * labelMat[i], X[i,:].T) 150 return w 151 152 153 def kernelTrans(X, A, kTup): 154 ''' 155 156 :param 
X: 所有資料集 157 :param A: 資料集中的一行 158 :param kTup: 元組kTup給出的是核函式的資訊,元組的第一個引數是描述所用核函式型別的一個字串, 159 其他兩個引數則都是核函式可能需要的可選引數 160 :return: 161 ''' 162 m, n = np.shape(X) 163 K = np.mat(np.zeros((m, 1))) 164 if kTup[0] == 'lin': 165 # 線性核函式 166 K = X * A.T 167 elif kTup[0] == 'rbf': 168 # 徑向基核函式 169 for j in range(m): 170 deltaRow = X[j, :] - A 171 K[j] = deltaRow * deltaRow.T 172 K = np.exp(K / (-1*kTup[1]**2)) 173 else: 174 raise NameError('Houston We Have a Problem -- That Kernel is not recognized') 175 return K 176 177 178 def testRbf(k1 = 1.3): 179 ''' 180 181 :param k1: 高斯徑向基函式中的一個使用者定義變數 182 :return: 183 ''' 184 # 讀入資料集 185 dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF.txt") 186 # 執行Platt Smo演算法,核函式型別為rbf 187 b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1)) 188 # 建立資料矩陣副本 189 dataMat = np.mat(dataArr) 190 labelMat = np.mat(labelArr).transpose() 191 # 找出非零的alpha值 192 svInd = np.nonzero(alphas.A > 0)[0] 193 # 得到所需要的支援向量 194 sVs = dataMat[svInd] 195 # 得到類別標籤值 196 labelSV = labelMat[svInd] 197 print('there are %d Support Vectors' % np.shape(sVs)[0]) 198 m, n = np.shape(dataMat) 199 errorCount = 0 200 for i in range(m): 201 # 利用核函式進行分類 202 kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1)) # 得到轉換後的資料 203 predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b 204 if np.sign(predict) != np.sign(labelArr[i]): 205 # sign()是Python的Numpy中的取數字符號(數字前的正負號)的函式。大於0時取1,小於0時取-1,等於0時取0 206 errorCount += 1 207 print('the training error rate is: %f' % (float(errorCount)/m)) 208 dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSetRBF2.txt") 209 errorCount = 0 210 dataMat = np.mat(dataArr) 211 labelMat = np.mat(labelArr).transpose() 212 m, n = np.shape(dataMat) 213 for i in range(m): 214 kernelEval = kernelTrans(sVs, dataMat[i, :], ('rbf', k1)) 215 predict = kernelEval.T * np.multiply(labelSV, alphas[svInd]) + b 216 if np.sign(predict) != np.sign(labelArr[i]): 217 errorCount += 1 218 print('the test error 
rate is: %f' % (float(errorCount)/m)) 219 220 221 ''' 222 *************************************************************************** 223 ''' 224 # 手寫數字識別 225 def img2vector(filename): 226 returnVect = np.zeros((1, 1024)) # 建立1行1024列的零陣列 227 fr = open(filename) 228 for i in range(32): 229 lineStr = fr.readline() 230 for j in range(32): 231 returnVect[0, 32*i+j] = int(lineStr[j]) # 將32x32的圖片轉換成1x1024的行向量 232 return returnVect 233 234 235 def loadImages(dirName): 236 hwLabels = [] 237 trainingFileList = os.listdir(dirName) # 返回包含資料夾下的所有檔名的列表 238 m = len(trainingFileList) # 獲取所有檔案的個數 239 trainingMat = np.zeros((m, 1024)) 240 for i in range(m): 241 fileNameStr = trainingFileList[i] 242 fileStr = fileNameStr.split('.')[0] 243 classNumStr = int(fileStr.split('_')[0]) 244 # 二分類,只識別9 245 if classNumStr == 9: 246 hwLabels.append(-1) 247 else: 248 hwLabels.append(1) 249 trainingMat[i, :] = img2vector('%s/%s'%(dirName, fileNameStr)) 250 return trainingMat, hwLabels 251 252 253 def testDigits(kTup=('rbf', 10)): 254 dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/trainingDigits') 255 b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup) 256 dataMat = np.mat(dataArr) 257 labelMat = np.mat(labelArr).transpose() 258 svInd = np.nonzero(alphas.A>0)[0] 259 sVs = dataMat[svInd] 260 labelSV = labelMat[svInd] 261 print('there are %d Support Vectors' % np.shape(sVs)[0]) 262 m, n = np.shape(dataMat) 263 errorCount = 0 264 for i in range(m): 265 kernelEval = kernelTrans(sVs, dataMat[i, :], kTup) 266 predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b 267 if np.sign(predict) != np.sign(labelArr[i]): 268 errorCount += 1 269 print('the training error rate is %f' % (float(errorCount)/m)) 270 dataArr, labelArr = loadImages('../machinelearninginaction/Ch02/digits/testDigits') 271 dataMat = np.mat(dataArr) 272 labelMat = np.mat(labelArr).transpose() 273 m, n = np.shape(dataMat) 274 275 for i in range(m): 276 kernelEval = kernelTrans(sVs, dataMat[i, :], 
kTup) 277 predict = kernelEval.T*np.multiply(labelSV, alphas[svInd]) + b 278 if np.sign(predict) != np.sign(labelArr[i]): 279 errorCount += 1 280 print('the test error rate is %f' % (float(errorCount)/m)) 281 282 283 ''' 284 *************************************************************************** 285 ''' 286 287 288 def loadDataSet(filename): 289 # 該函式開啟檔案並對其進行逐行解析,從而得到每行的類標籤和整個資料矩陣 290 dataMat = [] 291 labelMat = [] 292 fr = open(filename) 293 for line in fr.readlines(): 294 lineArr = line.strip().split('\t') 295 dataMat.append([float(lineArr[0]), float(lineArr[1])]) 296 labelMat.append(float(lineArr[2])) 297 return dataMat,labelMat 298 299 300 def selectJrand(i, m): 301 ''' 302 303 :param i:第一個alpha的下標 304 :param m: 所有alpha的數目 305 :return: 306 ''' 307 j = i 308 while j == i: 309 j = int(np.random.uniform(0, m)) # 隨機選擇不等於i的j值 310 return j 311 312 313 def clipAlpha(aj, H, L): 314 # 此函式用於調整大於H或小於L的alpha值 315 if aj > H: 316 aj = H 317 if aj < L: 318 aj = L 319 return aj 320 321 322 ''' 323 SMO函式的虛擬碼大致如下: 324 建立一個alpha向量並將其初始化為0向量 325 當迭代次數小於最大迭代次數時(外迴圈): 326 對資料集中的每個資料向量(內迴圈): 327 如果該資料向量可以被優化: 328 隨機選擇另外一個數據向量 329 同時優化這兩個向量 330 如果兩個向量都不能被優化,退出內迴圈 331 如果所有向量都沒被優化,增加迭代數目,繼續下一次迴圈 332 333 ''' 334 335 336 def smoSimple(dataMatIn, classLabels, C, toler, maxIter): 337 ''' 338 339 :param dataMatIn:資料集 340 :param classLabels: 類別標籤 341 :param C: 常數C 342 :param toler: 容錯率 343 :param maxIter: 最大迴圈次數 344 :return: 345 ''' 346 dataMatrix = np.mat(dataMatIn) # 建立矩陣 347 labelMat = np.mat(classLabels).transpose() # 矩陣轉置,變成列向量 348 b = 0 349 m, n = np.shape(dataMatrix) # 獲取行和列 350 alphas = np.mat(np.zeros((m,1))) 351 iter = 0 352 353 while iter < maxIter: 354 alphaPairsChanged = 0 # 用於記錄alpha是否已經進行優化 355 for i in range(m): 356 # fXi是預測的類別 357 fXi = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b 358 # np.multiply為點乘,即對應元素相乘 359 Ei = fXi - float(labelMat[i]) # 預測結果和真實結果比對,計算誤差Ei 360 if((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or ((labelMat[i] * Ei > 
toler) and (alphas[i]>0)): 361 # 不管是正間隔還是負間隔都會被測試,也要同時檢查alpha的值,以保證其不能等於0或C 362 j = selectJrand(i, m) # 隨機選擇第二個alpha值,即alpha[j] 363 fXj = float(np.multiply(alphas, labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b 364 Ej = fXj - float(labelMat[j]) # 計算第二個alpha的誤差 365 alphaIold = alphas[i].copy() 366 alphaJold = alphas[j].copy() 367 if labelMat[i] != labelMat[j]: 368 L = max(0, alphas[j] - alphas[i]) 369 H = min(C, C + alphas[j] - alphas[i]) 370 else: 371 L = max(0, alphas[j] + alphas[i] - C) 372 H = min(C, alphas[j] + alphas[i]) 373 # 以上將alpha[j]調整到0到C之間 374 if L == H: 375 print("L == H") 376 continue # 本次迴圈結束直接進行下一次for迴圈 377 eta = 2.0 * dataMatrix[i, :]*dataMatrix[j, :].T - dataMatrix[i, :]*dataMatrix[i,:].T - \ 378 dataMatrix[j,:]*dataMatrix[j,:].T 379 # eta是alpha[j]的最優修改量 380 if eta >= 0: 381 print(eta >= 0) 382 continue 383 alphas[j] -= labelMat[j]*(Ei - Ej)/eta 384 alphas[j] = clipAlpha(alphas[j], H, L) # 計算新的alpha[j] 385 if abs(alphas[j] - alphaJold) < 0.00001: 386 # 檢查alpha[j]是否有輕微改變,如果是的話就退出for迴圈 387 print("j not moving enough") 388 continue 389 # 對alpha[i]進行修改,修改量與alpha[j]相同,但反向相反 390 alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j]) 391 # 給兩個alpha值設定常數項b 392 b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i,:]*dataMatrix[i,:].T - \ 393 labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T 394 b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * dataMatrix[i,:]*dataMatrix[j,:].T - \ 395 labelMat[j]*(alphas[j] - alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T 396 if (0 < alphas[i]) and (C > alphas[i]): 397 b = b1 398 elif (0 < alphas[j]) and (C > alphas[j]): 399 b = b2 400 else: 401 # 如果程式執行到for迴圈的最後一行都不執行continue語句,那麼就已經成功地改變了一對alpha,增加alphaPairsChanged 402 b = (b1 + b2) / 2.0 403 alphaPairsChanged += 1 404 print('iter: %d i: %d, pairs changed %d'%(iter, i, alphaPairsChanged)) 405 # 檢查alpha值是否做了更新,如果有更新則將iter設為0後繼續執行程式。 406 # 只有在所有資料集上遍歷maxIter次,且不再發生任何alpha修改之後,程式才會停止並退出while迴圈 407 if alphaPairsChanged == 0: 408 
iter += 1 409 else: 410 iter = 0 411 print('iteration number: %d' % iter) 412 return b, alphas 413 414 415 if __name__ == "__main__": 416 # testRbf() 417 # testDigits(('lin', 0)) 418 testDigits(('rbf', 10)) 419 420 ''' 421 # Python中Numpy mat的使用 https://www.cnblogs.com/lfri/p/10561914.html 422 dataArr, labelArr = loadDataSet("../machinelearninginaction/Ch06/testSet.txt") 423 # print(dataArr) 424 print(labelArr) 425 # b, alphas = smoSimple(dataArr, labelArr, 0.6, 0.001, 40) 426 b, alphas = smoP(dataArr, labelArr, 0.6, 0.001, 40) 427 # print(b) 428 # print(alphas[alphas>0]) # 陣列過濾,只對nunpy型別有用 429 print(np.shape(alphas[alphas>0])) # 支援向量的個數 430 for i in range(100): 431 # 輸出支援向量 432 if alphas[i] > 0.0: 433 print(dataArr[i], labelArr[i]) 434 ws = clacWs(alphas, dataArr, labelArr) 435 print(ws) 436 dataMat = np.mat(dataArr) 437 errorCount = 0 438 for k in range(np.shape(dataArr)[0]): 439 res = dataMat[k] * np.mat(ws) + b 440 if res < 0: 441 res = -1 442 else: 443 res = 1 444 if res != labelArr[k]: 445 errorCount += 1 446 means = errorCount/(np.shape(dataArr)[0]) 447 print('error rate= %.5f'%means) 448 '''
Sample output:

there are 328 Support Vectors
the training error rate is 0.000000
the test error rate is 0.006342