Chinese Sentiment Analysis with Word2vec

'''
Chinese sentiment analysis
'''

from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
import joblib    # persist models/data as binary files
from sklearn.svm import SVC
import sys

'''
Data preprocessing: load the data,
                    segment it,
                    and split it into training and test sets
'''
def load_file_and_processing():
    # read with header=None so the review text is available as column 0
    # (without this, pos[0]/neg[0] below raises a KeyError)
    neg = pd.read_excel('H:/word2vect_3data/Chinese_data/neg.xls', header=None)
    pos = pd.read_excel('H:/word2vect_3data/Chinese_data/pos.xls', header=None)

    cw = lambda x: list(jieba.cut(x))    # segment each review with jieba
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)

    # use 1 for positive sentiment, 0 for negative
    y = np.concatenate((np.ones(len(pos)),np.zeros(len(neg))))

    x_train,x_test,y_train,y_test = train_test_split(np.concatenate((pos['words'],neg['words'])),y,test_size=0.2)

    np.save('H:/word2vect_3data/Chinese_data/y_train.npy', y_train)
    np.save('H:/word2vect_3data/Chinese_data/y_test.npy', y_test)
    return x_train,x_test



'''
Average all word vectors in a sentence to produce one vector per sentence
'''
def build_sentence_vector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in text:
        try:
            vec += imdb_w2v.wv[word].reshape((1, size))   # look up the word vector
            count += 1
        except KeyError:
            continue        # skip words that are not in the vocabulary
    if count != 0:
        vec /= count        # average over the words that were found
    return vec
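
# Example (hypothetical tokens): build_sentence_vector(['手機', '很', '好'], 300, imdb_w2v)
# returns a (1, 300) array, which is why the per-sentence results can be stacked with
# np.concatenate in get_train_vecs below.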

'''
Train word2vec and build the sentence vectors
'''
def get_train_vecs(x_train,x_test):
    n_dim = 300
    # initialize the model and build the vocabulary
    # (words appearing fewer than min_count times are dropped; gensim's default is 5)
    imdb_w2v = Word2Vec(vector_size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)

    # train the model on the training reviews
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.epochs)

    train_vecs = np.concatenate([build_sentence_vector(z,n_dim,imdb_w2v) for z in x_train])
    np.save('H:/word2vect_3data/Chinese_data/train_vecs.npy',train_vecs)
    print('train_vecs size:')
    print(train_vecs.shape)

    # continue training on the test set so its vocabulary is covered as well
    imdb_w2v.build_vocab(x_test, update=True)
    imdb_w2v.train(x_test, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.epochs)
    imdb_w2v.save('H:/word2vect_3data/Chinese_data/w2v_model.pkl')
    # build the test review vectors, then (optionally) scale them; see the note at the end of this function
    test_vecs = np.concatenate([build_sentence_vector(z, n_dim, imdb_w2v) for z in x_test])
    np.save('H:/word2vect_3data/Chinese_data/test_vecs.npy',test_vecs)
    print('test_vecs size:')
    print(test_vecs.shape)
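
    # Optional step, not implemented in the original code: the comment above says the
    # sentence vectors should also be scaled. A minimal sketch using scikit-learn (an
    # assumption, not the post's code) would be, applied before the np.save calls above:
    #     from sklearn.preprocessing import scale
    #     train_vecs = scale(train_vecs)
    #     test_vecs = scale(test_vecs)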


def get_data():
    train_vecs = np.load('H:/word2vect_3data/Chinese_data/train_vecs.npy')
    y_train = np.load('H:/word2vect_3data/Chinese_data/y_train.npy')
    test_vecs = np.load('H:/word2vect_3data/Chinese_data/test_vecs.npy')
    y_test = np.load('H:/word2vect_3data/Chinese_data/y_test.npy')
    return train_vecs,test_vecs,y_train,y_test


'''
Train the SVM model
'''

def svm_train(train_vecs,y_train,test_vecs,y_test):
    clf = SVC(kernel='rbf',verbose=True)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf, 'H:/word2vect_3data/Chinese_data/model.pkl')
    print(clf.score(test_vecs, y_test))    # accuracy on the held-out test set


'''
Build the vector for the sentence to be classified
'''
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('H:/word2vect_3data/Chinese_data/w2v_model.pkl')
    sent_vec = build_sentence_vector(words, n_dim, imdb_w2v)
    return sent_vec


'''
Sentiment analysis for a single sentence
'''
def svm_predict(string):
    words = jieba.lcut(string)          # jieba.lcut returns a list directly (jieba.cut returns a generator)
    words_vecs = get_predict_vecs(words)
    clf = joblib.load('H:/word2vect_3data/Chinese_data/model.pkl')

    result = clf.predict(words_vecs)

    if int(result[0]) == 1:
        print('positive')
    else:
        print('negative')
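

'''
Putting it all together: a usage sketch, not part of the original post. The call order
below and the example sentence are assumptions; the file paths are the ones used above.
'''
if __name__ == '__main__':
    # 1. load and segment the reviews, split into train/test
    x_train, x_test = load_file_and_processing()
    # 2. train word2vec and build the averaged sentence vectors
    get_train_vecs(x_train, x_test)
    # 3. reload the saved vectors/labels and train the SVM
    train_vecs, test_vecs, y_train, y_test = get_data()
    svm_train(train_vecs, y_train, test_vecs, y_test)
    # 4. classify a new sentence
    svm_predict('這個手機質量很好,我非常喜歡')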