機器學習---決策樹decision tree的應用



2.1 特性:

安裝 Graphviz——將 dot 檔案轉換為 pdf 以視覺化決策樹:dot -Tpdf input.dot -o output.pdf


分類(classification),迴歸(regression),聚類(clustering),降維(dimensionality reduction)
模型選擇(model selection),預處理(preprocessing)





RID age income student credit_rating class_buys_computer
1 youth high no fair no
2 youth high no excellent no
3 middle_aged high no fair yes
4 senior medium no fair yes
5 senior low yes fair yes
6 senior low yes excellent no
7 middle_aged low yes excellent yes
8 youth medium no fair no
9 youth low yes fair yes
10 senior medium yes fair yes
11 youth medium yes excellent yes
12 middle_aged medium no excellent yes
13 middle_aged high yes fair yes
14 senior medium no excellent no


from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import tree
from sklearn import preprocessing
from sklearn.externals.six import StringIO

# Read the CSV file and split every record into a feature dict (one per row)
# and a class label (the last column).
# NOTE: the Python 2 form of this code used open(..., 'rb') and reader.next();
# under Python 3 that fails with "'_csv.reader' object has no attribute 'next'",
# so the file is opened in text mode and the header row is fetched with next().
featureList = []
labelList = []

# newline='' is the mode the csv module asks for when a file is handed to
# csv.reader; the with-block closes the handle once parsing is done.
with open(r'AllElectronics.csv', 'rt', newline='') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    headers = next(reader)
    #['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer']

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        # The last column holds the class label.
        labelList.append(row[-1])
        # Columns between the first (RID) and the last (label) are the features,
        # keyed by their header name.
        rowDict = {headers[i]: row[i] for i in range(1, len(row) - 1)}
        featureList.append(rowDict)

print(featureList)

[{'age': 'youth', 'credit_rating': 'fair', 'income': 'high', 'student': 'no'}, 
{'age': 'youth', 'credit_rating': 'excellent', 'income': 'high', 'student': 'no'}, 
{'age': 'middle_aged', 'credit_rating': 'fair', 'income': 'high', 'student': 'no'}, 
{'age': 'senior', 'credit_rating': 'fair', 'income': 'medium', 'student': 'no'}, 
{'age': 'senior', 'credit_rating': 'fair', 'income': 'low', 'student': 'yes'}, 
{'age': 'senior', 'credit_rating': 'excellent', 'income': 'low', 'student': 'yes'}, 
{'age': 'middle_aged', 'credit_rating': 'excellent', 'income': 'low', 'student': 'yes'}, 
{'age': 'youth', 'credit_rating': 'fair', 'income': 'medium', 'student': 'no'}, 
{'age': 'youth', 'credit_rating': 'fair', 'income': 'low', 'student': 'yes'}, 
{'age': 'senior', 'credit_rating': 'fair', 'income': 'medium', 'student': 'yes'}, 
{'age': 'youth', 'credit_rating': 'excellent', 'income': 'medium', 'student': 'yes'}, 
{'age': 'middle_aged', 'credit_rating': 'excellent', 'income': 'medium', 'student': 'no'},
{'age': 'middle_aged', 'credit_rating': 'fair', 'income': 'high', 'student': 'yes'}, 
{'age': 'senior', 'credit_rating': 'excellent', 'income': 'medium', 'student': 'no'}]

# Vectorize features: turn the list of categorical feature dicts into a dense
# numeric matrix with one indicator column per "feature=value" pair.
vec = DictVectorizer()
sparseX = vec.fit_transform(featureList)
dummyX = sparseX.toarray()

print("dummyX: {}".format(dummyX))
# Conceptual sketch of the indicator encoding for the first record
# (youth, high income, not a student, fair credit, does not buy).
# The real columns of dummyX follow the vectorizer's own ordering, not this one.
#youth  middle_aged senior  high medium low   yes no   fair excellent    buy
# 1        0          0      1     0     0     0   1    1     0           0
[[ 0.  0.  1.  0.  1.  1.  0.  0.  1.  0.]
 [ 0.  0.  1.  1.  0.  1.  0.  0.  1.  0.]
 [ 1.  0.  0.  0.  1.  1.  0.  0.  1.  0.]
 [ 0.  1.  0.  0.  1.  0.  0.  1.  1.  0.]
 [ 0.  1.  0.  0.  1.  0.  1.  0.  0.  1.]
 [ 0.  1.  0.  1.  0.  0.  1.  0.  0.  1.]
 [ 1.  0.  0.  1.  0.  0.  1.  0.  0.  1.]
 [ 0.  0.  1.  0.  1.  0.  0.  1.  1.  0.]
 [ 0.  0.  1.  0.  1.  0.  1.  0.  0.  1.]
 [ 0.  1.  0.  0.  1.  0.  0.  1.  0.  1.]
 [ 0.  0.  1.  1.  0.  0.  0.  1.  0.  1.]
 [ 1.  0.  0.  1.  0.  0.  0.  1.  1.  0.]
 [ 1.  0.  0.  0.  1.  1.  0.  0.  0.  1.]
 [ 0.  1.  0.  1.  0.  0.  0.  1.  1.  0.]]
['age=middle_aged', 'age=senior', 'age=youth', 
 'credit_rating=excellent', 'credit_rating=fair', 
 'income=high', 'income=low', 'income=medium', 
 'student=no', 'student=yes']
print("labelList: {}".format(labelList))
# Expected output for the sample data:
#['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']

# Vectorize the class labels so the classifier receives numeric targets.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY: {}".format(dummyY))

# Train a decision tree that chooses splits by entropy (information gain).
# Calling tree.DecisionTreeClassifier() without arguments would use the
# library's default criterion instead.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(dummyX, dummyY)
print("clf: {}".format(clf))

# Visualize model: write the fitted tree to a Graphviz .dot file.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); use whichever this
# installation provides so the script runs on both old and new releases.
if hasattr(vec, "get_feature_names_out"):
    featureNames = list(vec.get_feature_names_out())
else:
    featureNames = list(vec.get_feature_names())

with open("allElectronicInformationGainOri.dot", 'w') as f:
    # export_graphviz writes straight into the open handle; its return value is
    # deliberately not assigned so that f keeps referring to the file.
    tree.export_graphviz(clf, feature_names=featureNames, out_file=f)

最後把生成的.dot檔案轉換成視覺化的pdf檔案,dot -Tpdf input.dot -o output.pdf



# Take the first encoded training record as a template for a new sample.
oneRowX = dummyX[0, :]
print("oneRowX: " + str(oneRowX))
#oneRowX: [ 0.  0.  1.  0.  1.  1.  0.  0.  1.  0.]

# dummyX[0, :] is a view into dummyX, so work on a copy; editing the view
# directly would also rewrite the first training row (and oneRowX itself).
newRowX = oneRowX.copy()
# Switch the age indicator from column 2 to column 0.
newRowX[0] = 1
newRowX[2] = 0
print("newRowX: " + str(newRowX))
#newRowX: [ 1.  0.  0.  0.  1.  1.  0.  0.  1.  0.]

# Passing the 1-D row directly, i.e. clf.predict(newRowX), fails with:
#     "if it contains a single sample.".format(array))
# ValueError: Expected 2D array, got 1D array instead:
# array=[ 0.  0.  1.  0.  1.  1.  0.  0.  1.  0.].
# Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
# The row is therefore reshaped to a (1, n_features) matrix holding one sample.
predictedY = clf.predict(newRowX.reshape(1,-1))
print("predictedY: " + str(predictedY))
#predictedY: [1]


