20180923 word2vec相似度改進(不浪費句子)
阿新 • • 發佈:2018-12-11
改進:沒有詞向量的詞就丟掉,不放進詞集合裡,這樣就不會浪費整個句子。
#!/usr/bin/python # -*- coding: UTF-8 -*- from __future__ import division #除法 import sys import codecs #可以以特定編碼開啟檔案 import jieba import jieba.posseg as pseg reload(sys) #zzh說這種方法不好,不要再用了!!! 可是真的很好用啊 QAQ sys.setdefaultencoding('utf-8') import gensim # model = gensim.models.Word2Vec.load("22620491.model") model = gensim.models.KeyedVectors.load_word2vec_format('news_12g_baidubaike_20g_novel_90g_embedding_64.bin', binary=True) word_vec = model.wv del model #把模型給word_vec,所以Model刪掉。 print word_vec[u'難過'] f = codecs.open("xlj_fenci.txt",'r','utf-8') #codecs包指定TXT開啟方式 lines = f.readlines() #doc = open('fenlei.txt', 'w') right,wrong,total=0,0,0 cntl,cnta,cntn,cntj,cntw=0,0,0,0,0 #標註時每一類的數量 resl,resa,resn,resj,resw=0,0,0,0,0 #分類正確每一類結果數量 for line in lines: #每一行彈幕 if lines.index(line) % 500 ==0: #顯示跑到多少條資料 print lines.index(line) if line.split(" ")[0].split(" ")[0]=="0": #分類正確個數 cntl=cntl+1 elif line.split(" ")[0].split(" ")[0]=="1": cnta=cnta+1 elif line.split(" ")[0].split(" ")[0]=="2": cntn=cntn+1 elif line.split(" ")[0].split(" ")[0]=="3": cntj=cntj+1 elif line.split(" ")[0].split(" ")[0]=="4": cntw=cntw+1 line1=line.split(" ")[1] words=line1.split(" ") u = [] for word in words: if word != "\r\n": #去掉換行符,linux只用\n換行。win下用\r\n表示換行。反正\n不行就\r\n試試! #print type(word) try: word_vec[word] u.append(word) #word_vec輸入必須要unicode才行。 except: continue le = [u'樂'] ai = [u'哀'] nu = [u'怒'] jing = [u'驚'] wu = [u'惡'] try: l,a,n,j,w=word_vec.n_similarity(u, le),word_vec.n_similarity(u, ai),word_vec.n_similarity(u, nu),word_vec.n_similarity(u, jing),word_vec.n_similarity(u, wu) list=[l,a,n,j,w] # print list #doc.write(line.split(" ",1)[0]+" "+str(list.index(max(list)))+" "+line.split(" ",1)[1]+'\n') # index記得 變成 str啊 !!改了好半天! 
if str(list.index(max(list)))==line.split(" ")[0].split(" ")[0]: right=right+1 if str(list.index(max(list)))=="0": #分類正確個數 resl=resl+1 elif str(list.index(max(list)))=="1": resa=resa+1 elif str(list.index(max(list)))=="2": resn=resn+1 elif str(list.index(max(list)))=="3": resj=resj+1 elif str(list.index(max(list)))=="4": resw=resw+1 except: #doc.write(line.split(" ",1)[0]+" "+"-1"+" "+line.split(" ",1)[1]+'\n') wrong=wrong+1 continue total=total+1 print(right,wrong,total) print(cntl,cnta,cntn,cntj,cntw) print(resl,resa,resn,resj,resw) print(resl/cntl,resa/cnta,resn/cntn,resj/cntj,resw/cntw) print("end") f.close() #doc.close()