1. 程式人生 > >模式識別設計(Python程式設計):IRIS資料集的Kmeans聚類與分解聚類法

模式識別設計(Python程式設計):IRIS資料集的Kmeans聚類與分解聚類法

題目:本次作業的實驗需求是使用分解聚類法與 c-means(即 K-means)聚類法對 IRIS 資料集進行聚類。K-means 聚類程式碼摘錄自網路,分解聚類法為純原創。PS:因為時間緊,分解聚類法進行第二次分解時偷懶了(直接對第一次分解得到的第 1 類再做一次分解,沒有比較應該分解哪一類)~~有緣人改改吧~~

資料格式:data.txt 每行一筆樣本,為以空格分隔的 4 個浮點數特徵值;共 150 筆,依類別順序每 50 筆為一類(準確度計算以此順序為準)。

kmeans程式碼:

import math
from collections import defaultdict
import numpy as np
dataname = "data.txt"
def loadIRISdata(filename):
    """Load a whitespace-separated numeric text file into a list of rows.

    Every non-blank line is split on runs of whitespace and each field is
    converted to ``float``.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list containing one list of floats per non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            # split() without an argument collapses repeated spaces/tabs and
            # ignores the trailing newline, so doubled or trailing spaces no
            # longer produce empty fields that float() would reject.
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: nothing to parse.
                continue
            data.append([float(field) for field in fields])
    return data

def generateCenters(data):
    """Pick the three initial cluster centres.

    The samples stored at positions 0, 50 and 100 are used as seeds, one per
    cluster.  The returned list holds those row objects themselves, not
    copies.
    """
    # One seed position per cluster (three clusters in total).
    seed_positions = (0, 50, 100)
    return [data[position] for position in seed_positions]

def distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    Only the first four components of each vector take part in the
    computation.
    """
    squared_total = 0
    for axis in range(4):
        delta = a[axis] - b[axis]
        squared_total += delta * delta
    return math.sqrt(squared_total)

def point_avg(points):
    """Return the per-dimension mean of ``points``.

    The first four components of every point are averaged and each mean is
    rounded to eight decimal places.  An empty ``points`` sequence leads to
    a ``ZeroDivisionError``.
    """
    count = float(len(points))
    centroid = []
    for axis in range(4):
        total = 0
        for sample in points:
            total += sample[axis]
        # Round through an 8-decimal string representation.
        centroid.append(float("%.8f" % (total / count)))
    return centroid

def updataCenters(data, assigments):
    """Recompute the three cluster centres from the current labelling.

    ``assigments[i]`` is the cluster label of ``data[i]``.  The samples are
    grouped by label and the centre of clusters 0, 1 and 2 is the
    ``point_avg`` of each group, returned in that order.
    """
    # Gather the samples that share a label.
    members = {}
    for label, sample in zip(assigments, data):
        members.setdefault(label, []).append(sample)
    # A label that never occurred contributes an empty group.
    return [point_avg(members.get(cluster, [])) for cluster in range(3)]

def assignment(data, centers):
    """Label every sample with the index of its nearest centre.

    The first three entries of ``centers`` are compared using ``distance``;
    on a tie the lowest index wins.  The returned list is aligned with
    ``data``: position ``i`` holds the cluster index of ``data[i]``.
    """
    labels = []
    for sample in data:
        best_index, best_distance = 0, float('inf')
        # Try each of the three centres and keep the strictly closest one.
        for index in range(3):
            candidate = distance(sample, centers[index])
            if candidate < best_distance:
                best_index, best_distance = index, candidate
        labels.append(best_index)
    return labels

def kmeans(data):
    """Cluster ``data`` into three groups with K-means.

    Starting from the centres given by ``generateCenters``, the centres are
    recomputed and the samples relabelled until a relabelling leaves the
    labels unchanged.

    Returns:
        A list of ``(label, sample)`` pairs in the order of ``data``.
    """
    labels = assignment(data, generateCenters(data))
    while True:
        centers = updataCenters(data, labels)
        relabelled = assignment(data, centers)
        if relabelled == labels:
            # Converged: another update would not move any sample.
            break
        labels = relabelled
    return list(zip(relabelled, data))

def acc(result):
    """Count how many samples carry the label expected for their position.

    ``result`` is a sequence of ``(label, sample)`` pairs.  Positions 0-49
    are expected to be labelled 0, positions 50-99 labelled 1 and positions
    100-149 labelled 2.  The two counters are printed and returned.

    Returns:
        A ``(matches, checked)`` tuple.
    """
    hits = 0
    checked = 0
    for label in range(3):
        start = label * 50
        for offset in range(50):
            if result[start + offset][0] == label:
                hits += 1
            checked += 1
    print('sum:', hits, 'all:', checked)
    return hits, checked

# Script entry point: run K-means on the data file, list the sample indices
# of every cluster, then report the positional accuracy.
if __name__ == "__main__":
    data = loadIRISdata(dataname)
    result = kmeans(data)
    for cluster in range(3):
        printed = 0
        print('\n')
        print("第%d類資料有:" % (cluster + 1))
        for index, (label, _point) in enumerate(result):
            if label == cluster:
                print(index, end=' ')
                printed += 1
            # Break the listing once more than 20 indices were printed.
            if printed > 20:
                print('\n')
                printed = 0
    print('\n')
    # Named so that the builtins sum()/all() are not rebound at module level.
    correct, total = acc(result)
    # '%.2f' prints two decimals; the former '%2f' was only a field width
    # and therefore printed six decimals.
    print('c-means準確度為:%.2f%%' % ((correct / total) * 100))

kmeans結果:

分解聚類程式碼:

import math
from collections import defaultdict
import numpy as np
dataname = "data.txt"
def loadIRISdata(filename):
    """Read a whitespace-separated numeric text file.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        One list of floats per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            # Splitting on any whitespace run tolerates doubled spaces, tabs
            # and trailing blanks, which split(" ") turned into empty fields
            # that float() cannot parse.
            fields = line.split()
            if fields:
                data.append([float(field) for field in fields])
    return data
def E(N, N1, N2, a, b):
    """Return the split criterion ``(N1 * N2 / N) * (a - b) . (a - b)``.

    Args:
        N: Total number of samples.
        N1: Number of samples in the first group.
        N2: Number of samples in the second group.
        a: Mean vector of the first group (array-like).
        b: Mean vector of the second group (array-like).

    Returns:
        The criterion value as a Python ``float``.
    """
    # Work on flat float arrays so plain lists are accepted as well and no
    # np.matrix objects are needed.
    diff = np.asarray(a, dtype=float).ravel() - np.asarray(b, dtype=float).ravel()
    weight = (N1 * N2) / N
    # np.dot of two 1-D vectors yields a scalar, so float() never has to
    # convert a 1x1 matrix (a conversion NumPy has deprecated).
    return float(np.dot(weight * diff, diff))
def avg(data, k, assignments):
    """Return the mean vector of the samples assigned to cluster ``k``.

    Args:
        data: Sequence of sample vectors.
        k: Cluster label to select.
        assignments: Sequence of labels; ``assignments[i]`` belongs to
            ``data[i]``.

    Returns:
        The element-wise mean of the selected samples as a NumPy array.
        An empty cluster is not special-cased: the division by a zero count
        is left to NumPy.
    """
    # Pair labels with samples instead of indexing a fixed range(150), so
    # the function works for any number of samples.
    members = [sample for label, sample in zip(assignments, data) if label == k]
    return np.sum(members, 0) / len(members)
def length(k, assignments):
    """Return how many entries of ``assignments`` equal the label ``k``.

    Args:
        k: Cluster label to count.
        assignments: Sequence of cluster labels, one per sample.

    Returns:
        The number of samples currently labelled ``k``.
    """
    # Iterate over the labels themselves rather than a fixed range(150), so
    # shorter or longer label lists are counted correctly.
    return len([label for label in assignments if label == k])
def _split_cluster(data, assignments, source, target):
    """Greedily move samples from cluster ``source`` to cluster ``target``.

    On every sweep each sample still labelled ``source`` is tentatively
    relabelled ``target`` and the criterion ``E`` is evaluated.  The sample
    with the largest value is moved for good, provided that value exceeds
    the best one seen so far during this split; the split ends at the first
    sweep that brings no improvement.  ``assignments`` is modified in place.
    """
    total = len(assignments)
    best = float('-inf')
    while True:
        chosen = None
        for i in range(total):
            if assignments[i] != source:
                continue
            assignments[i] = target  # tentative move
            mean_source = avg(data, source, assignments)
            # The target cluster contains at least sample i at this point,
            # so no special case for an empty target is required.
            mean_target = avg(data, target, assignments)
            score = E(total, length(source, assignments),
                      length(target, assignments), mean_source, mean_target)
            if score > best:
                chosen = i
                best = score
                print('max_%d:' % target, best)
            assignments[i] = source  # undo the tentative move
        if chosen is None:
            # No candidate improved the criterion: it has reached its peak.
            break
        assignments[chosen] = target


def decomposition_clustering(data, assignments):
    """Split the samples into three clusters by two successive divisions.

    The first division moves samples from cluster 0 to cluster 1, the second
    one moves samples from cluster 1 to cluster 2.  Both use the same greedy
    procedure driven by the criterion ``E``.

    Args:
        data: Sequence of sample vectors.
        assignments: Mutable list of cluster labels, one per sample; it is
            updated in place.

    Returns:
        The same ``assignments`` list, now holding the labels 0, 1 and 2.
    """
    _split_cluster(data, assignments, 0, 1)
    # The second division always re-splits cluster 1; it does not check
    # whether cluster 0 would be the better one to divide.
    _split_cluster(data, assignments, 1, 2)
    return assignments
def acc(result):
    """Compare cluster labels with the label expected at each position.

    ``result`` is a sequence of ``(label, sample)`` pairs.  The first 50
    positions are expected to carry label 0, the next 50 label 1 and the
    last 50 label 2.  The number of matches and the number of positions
    checked are printed and returned.

    Returns:
        A ``(matches, checked)`` tuple.
    """
    expected = [0] * 50 + [1] * 50 + [2] * 50
    matched = [
        position
        for position, label in enumerate(expected)
        if result[position][0] == label
    ]
    correct, total = len(matched), len(expected)
    print('sum:', correct, 'all:', total)
    return correct, total


# Script entry point: run the decomposition clustering on the data file,
# list the sample indices of every cluster, then report the positional
# accuracy.
if __name__ == "__main__":
    data = loadIRISdata(dataname)
    # Every one of the 150 samples starts in cluster 0.
    assignments = [0] * 150
    answer = decomposition_clustering(data, assignments)
    result = list(zip(answer, data))
    for cluster in range(3):
        printed = 0
        print('\n')
        print("第%d類資料有:" % (cluster + 1))
        for index, (label, _point) in enumerate(result):
            if label == cluster:
                print(index, end=' ')
                printed += 1
            # Break the listing once more than 20 indices were printed.
            if printed > 20:
                print('\n')
                printed = 0
    print('\n')
    # Named so that the builtins sum()/all() are not rebound at module level.
    correct, total = acc(result)
    # '%.2f' prints two decimals; the former '%2f' was only a field width
    # and therefore printed six decimals.
    print('分解聚類法準確度為:%.2f%%' % ((correct / total) * 100))

分解聚類結果: