
Machine Learning in Action practice: computing the Shannon entropy of a given dataset

This post is mostly code (see the book Machine Learning in Action for details). It is written mainly for my own review, so the comments have not been trimmed; if you spot a mistake, corrections are welcome.

Information entropy: a measure of the uncertainty of the information in a system.

H(X) = -∑ᵢ p(xᵢ) · log₂ p(xᵢ)

from math import log

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)  # number of rows in the dataset
    labelCounts = {}
    # the following five lines build a dictionary counting every possible class
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the last column is the class label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # note: the book has an error on this line
    # count trace, showing only the updated entry:
    # row 0: {"yes": 1}  row 1: {"yes": 2}  row 2: {"no": 1}  row 3: {"no": 2}  row 4: {"no": 3}
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # class probability
        # take the logarithm base 2
        shannonEnt -= prob * log(prob, 2)  # accumulate -p * log2(p) to get the entropy
    return shannonEnt

# Manual check: Ent = -0.4*log2(0.4) - 0.6*log2(0.6)
# Ent_manual = -(0.4 * log(0.4, 2)) - (0.6 * log(0.6, 2))
# print(Ent_manual)  # ≈ 0.9710
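
The same check also runs directly with math.log2, the base-2 variant in the standard library; a minimal runnable sketch:

from math import log2

ent_manual = -(0.4 * log2(0.4)) - (0.6 * log2(0.6))
print(ent_manual)  # 0.9709505944546686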

# build a toy dataset
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
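
A quick sanity check of calcShannonEnt on this toy data (a sketch; the first value follows from the class counts yes:2 / no:3, and mixing in a third class should push the entropy up):

myDat, labels = createDataSet()
print(calcShannonEnt(myDat))  # 0.9709505944546686, matches the manual result above
myDat[0][-1] = 'maybe'        # introduce a third class: the data become more mixed...
print(calcShannonEnt(myDat))  # ...so the entropy rises to about 1.3710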

def splitDataSet(dataSet,axis,value): # three arguments: the dataset to split, the index of the splitting feature, and the feature value to keep
    # create a new list object
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:  # e.g. run the block below when featVec[0] == 0
            # the next three lines cut the splitting feature out of the row
            reducedFeatVec = featVec[:axis]   # featVec[:0] == [], i.e. an empty list
            reducedFeatVec.extend(featVec[axis + 1:]) # add the elements after index axis: rows 0/1/2 are skipped, rows 3 and 4 keep the value 1
            retDataSet.append(reducedFeatVec) # append the reduced row as a whole: row 3: [[1, 'no']], row 4: [[1, 'no'], [1, 'no']]
    return retDataSet
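
For example, splitting the toy dataset on feature 0 (the expected outputs also appear in the traces inside chooseBestFeatureToSplit below):

myDat, labels = createDataSet()
print(splitDataSet(myDat, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
print(splitDataSet(myDat, 0, 0))  # [[1, 'no'], [1, 'no']]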

# choose the best feature on which to split the dataset
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1 # exclude the label column
    baseEntropy = calcShannonEnt(dataSet) # entropy of the full dataset
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # the next two lines build the set of unique values of this feature
        featList = [example[i] for example in dataSet] # i=0: [1, 1, 1, 0, 0]   i=1: [1, 1, 0, 1, 1]
        uniqueVals = set(featList)  # i=0: {0, 1}   i=1: {0, 1}
        newEntropy = 0.0
        # the following five lines compute the weighted entropy of each split
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            print(subDataSet)
            # i=0, values 0/1 return: [[1, 'no'], [1, 'no']]   [[1, 'yes'], [1, 'yes'], [0, 'no']]
            # i=1, values 0/1 return: [[0, 'no']]   [[1, 'yes'], [1, 'yes'], [1, 'no'], [1, 'no']]
            prob = len(subDataSet)/float(len(dataSet))
            # i=0, values 0/1 give: 2/5  3/5
            # i=1, values 0/1 give: 1/5  4/5
            newEntropy += prob * calcShannonEnt(subDataSet)  # note: subDataSet here, not dataSet
        print("entropy for i = {}:".format(i), newEntropy)
        infoGain = baseEntropy - newEntropy # information gain
        if infoGain > bestInfoGain:
            # keep the best information gain seen so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
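
Working the gains out by hand (a sketch; the intermediate values match the traces printed above):

from math import log

base = 0.9709505944546686  # H of the full dataset, yes:2 / no:3
h_mix = -(2.0/3) * log(2.0/3, 2) - (1.0/3) * log(1.0/3, 2)  # H([yes, yes, no]) ≈ 0.9183
gain0 = base - (2.0/5 * 0.0 + 3.0/5 * h_mix)  # ≈ 0.4200
gain1 = base - (1.0/5 * 0.0 + 4.0/5 * 1.0)    # ≈ 0.1710
print(gain0, gain1)  # feature 0 has the larger gain, so chooseBestFeatureToSplit returns 0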

if __name__ == "__main__":
    myDat,labels = createDataSet()
    a = splitDataSet(myDat,0,0)
    # print(a)
    b = chooseBestFeatureToSplit(myDat)
    print(b)

    # comparison of the append() and extend() methods:
    # a = [1,2,3]
    # b = [4,5,6]
    # c = [7,8,9]
    # a.append(b)
    # print(a) # out:[1, 2, 3, [4, 5, 6]]
    # b.extend(c)
    # print(b)  # out:[4, 5, 6, 7, 8, 9]