
Random Forest in Machine Learning: A Python Implementation of the CART Model

Recording my machine-learning process here. A random forest is a classifier that trains and predicts on samples with an ensemble of decision trees; compared with a single decision tree, it balances out errors. The CART model:

  1. Binary decision tree: each node's feature test takes only the values "yes" and "no";
  2. The input features are split by a heuristic method:

Suppose the splitting variable of the current input space is the $j$-th input $x^{j}$, and the split point is a value $s$ taken by $x^{j}$. The two split regions are then:

$$R_{1}(j,s) = \{x \mid x^{j} \le s\} \quad \text{and} \quad R_{2}(j,s) = \{x \mid x^{j} > s\}$$

Searching for the optimal pair $(j, s)$ amounts to solving:

$$\min_{j,s}\Big[\min_{c_{1}}\sum_{x_{i}\in R_{1}(j,s)}(y_{i}-c_{1})^{2}+\min_{c_{2}}\sum_{x_{i}\in R_{2}(j,s)}(y_{i}-c_{2})^{2}\Big]$$

For a fixed $(j, s)$, the inner minimizers $\hat{c}_{m}$ are simply the region means:

$$\hat{c}_{m} = \frac{1}{N_{m}}\sum_{x_{i}\in R_{m}(j,s)}y_{i}, \quad m=1,2$$

Traversing all input variables finds the optimal $(j, s)$, which splits the input space into two regions. Repeating the same partitioning on each region until a stopping condition is met produces a regression tree (see the sketch after this list);

  3. To generate a classification tree, the CART decision tree uses the Gini index to select the optimal feature;

  4. Selecting different subsets of the data generates different CART decision trees; the test set is run through this collection of trees, and a majority vote yields the final result.
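As a sketch for item 2 above, the least-squares split search can be written straight from the formulas. The function name best_regression_split and the row layout (features first, target y last) are my assumptions for illustration, not part of the original code:

# Minimal sketch of the CART least-squares split search (illustrative only).
# Each row is [x^1, ..., x^d, y]; returns the (j, s) minimizing the summed
# squared error when each region is fit with its mean.
def best_regression_split(rows):
    best_j, best_s, best_err = None, None, float('inf')
    n_features = len(rows[0]) - 1
    for j in range(n_features):                      # candidate split variable x^j
        for row in rows:
            s = row[j]                               # candidate split point s
            r1 = [r[-1] for r in rows if r[j] <= s]  # y values in R1(j, s)
            r2 = [r[-1] for r in rows if r[j] > s]   # y values in R2(j, s)
            err = 0.0
            for region in (r1, r2):
                if region:
                    c = sum(region) / len(region)    # optimal c_m is the region mean
                    err += sum((y - c) ** 2 for y in region)
            if err < best_err:
                best_j, best_s, best_err = j, s, err
    return best_j, best_s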

The implementation code below was found online and slightly modified; the dataset is the UCI Wine Data Set (link). It can be run directly.

Python code:

#-*- coding: utf-8 -*-
# Random Forest Algorithm on the UCI Wine Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from math import log
# Load data file
def load_data(filename, ty):  # load a CSV or TXT data file
    dataset = list()
    with open(filename, 'r') as file:
        if ty == 'csv':
            readers = reader(file)
            for row in readers:
                if not row:
                    continue
                dataset.append(row)
        else:
            while True:          # txt file: read line by line
                readers = file.readline()
                if not readers:
                    break
                p_tmp = [float(i) for i in readers.split(',')]
                dataset.append(p_tmp)

    # The wine dataset stores the class label in the first column;
    # move it to the last column so the label is always row[-1].
    sets = []
    for data in dataset:
        temp = data[1:] + [data[0]]   # all features first, label last
        sets.append(temp)
    return sets

# Convert string column to float
def str_column_to_float(dataset, column):  # convert column `column` of the dataset to float
    for row in dataset:
        row[column] = float(row[column])

# Convert string column to integer
def str_column_to_int(dataset, column):    # map the label values in the last column to integers 0, 1, ...
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):  # split `dataset` into n_folds folds of len(dataset) // n_folds rows each; every row is used at most once
    dataset_split = list()
    fold_size = len(dataset) // n_folds  # integer division so fold_size is an int
    dataset_copy = list(dataset)  # copy once, outside the loop, so each row can land in at most one fold
    for i in range(n_folds):
        fold = list()   # reset fold on every iteration to avoid appending duplicates to dataset_split
        while len(fold) < fold_size:   # `while`, not `if`: keep sampling until the fold is full
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))  # pop() removes the row at `index` from dataset_copy and returns it
        dataset_split.append(fold)
    return dataset_split    # a list of n_folds folds, used for cross-validation

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):  # compare actual and predicted labels and return the accuracy as a percentage
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0



# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):   # the more accurate the split, the smaller the Gini index
    gini = 0.0
    for class_value in class_values:  # class_values, e.g. [0, 1, 2]
        for group in groups:          # groups = (left, right)
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini

# Select the best split point for a dataset: find the best feature index, its split value row[index], and the resulting groups (left, right)
def get_split(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))  # distinct class labels
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    features = list()
    while len(features) < n_features:
        index = randrange(len(dataset[0]) - 1)  # sample n_features random feature indices (roughly sqrt of the feature count); the last column is the label, so exclude it
        if index not in features:
            features.append(index)
    for index in features:        # search only the sampled features rather than all of them, which keeps each tree different
        for row in dataset:
            groups = test_split(index, row[index], dataset)  # groups = (left, right); each row[index] is tried as the split value to find the best feature and value
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups  # best feature b_index, best split value b_value, best split b_groups; b_score is the Gini cost of the split
    #print(b_score)
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value: the most common label in the group
def to_terminal(group):
    outcomes = [row[-1] for row in group]           # when max() is given a key function, that function defines the comparison criterion
    return max(set(outcomes), key=outcomes.count)   # the label that occurs most often in the group

# Create child splits for a node or make it terminal: recurse until the stopping conditions are met
def split(node, max_depth, min_size, n_features, depth):  # e.g. max_depth = 10, min_size = 1, n_features = int(sqrt(len(dataset[0]) - 1))
    left, right = node['groups']
    del(node['groups'])
# check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
# check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
# process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)  # node['left'] is itself a dict {'index': ..., 'value': ..., 'groups': ...}, so node is a nested dict
        split(node['left'], max_depth, min_size, n_features, depth+1)  # recurse; depth+1 tracks the recursion depth
# process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root

# Make a prediction with a decision tree
def predict(node, row):   # predict the class of a single row with one decision tree
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):    # isinstance() is a Python built-in that checks an object's type; a dict here means an internal node, otherwise it is a leaf label
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]  # predict the row with every tree in `trees`, then take a simple majority vote
    return max(set(predictions), key=predictions.count)

# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):   # create a random subsample of the dataset
    sample = list()
    n_sample = round(len(dataset) * ratio)   # round() returns the nearest integer
    while len(sample) < n_sample:
        index = randrange(len(dataset))  # sampling with replacement (the bootstrap): some rows appear multiple times, others never, which keeps each tree's training set different
        sample.append(dataset[index])
    return sample

# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for i in range(n_trees):   # n_trees is the number of trees in the forest
        sample = subsample(train, sample_size)  # random resampling keeps each tree's training set different
        tree = build_tree(sample, max_depth, min_size, n_features)  # build one decision tree
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]  # majority vote over all trees for each test row
    return predictions
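
The original post is truncated at this point. To make the script runnable end to end, a minimal driver in the same style could tie the helpers together; the file name 'wine.csv', the hyperparameter values, and the evaluate_algorithm helper below are my assumptions for illustration, not part of the original:

# Hypothetical driver (a sketch, not from the original post): evaluate the
# forest with k-fold cross-validation using the helpers defined above.
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])           # flatten the remaining folds into one training set
        test_set = [list(row) for row in fold]
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

seed(1)
dataset = load_data('wine.csv', 'csv')           # assumed file name for the UCI Wine Data Set
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)              # features to float
str_column_to_int(dataset, len(dataset[0]) - 1)  # class labels to 0, 1, 2
n_features = int(sqrt(len(dataset[0]) - 1))      # number of features sampled at each split
for n_trees in [1, 5, 10]:
    scores = evaluate_algorithm(dataset, random_forest, 5, 10, 1, 1.0, n_trees, n_features)
    print('Trees: %d, mean accuracy: %.3f%%' % (n_trees, sum(scores) / len(scores)))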