
Computing Document Similarity in NLP with TF-IDF

#!/usr/bin/python  
# -*- coding: utf-8 -*-
import numpy
import os
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Example corpus (each document is already whitespace-tokenised):
# corpus = ["我 來到 北京 清華大學", "我 他 來到 了 網易 杭研 大廈",
#           "小明 碩士 畢業 與 中國 科學院", "我 愛 北京 天安門"]

# Real documents are read from a file, one document per line
trainfile = open(r'D:\python_noweightpathway\TIA\TIAxmmc.txt', 'r', encoding='utf8')
traincorpus = trainfile.readlines()
trainfile.close()
corpus = traincorpus

# CountVectorizer turns text into a term-frequency matrix:
# element a[i][j] is the frequency of word j in document i
vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   # max_features=n_features,
                                   stop_words='english')
# TfidfTransformer computes a tf-idf weight for every word
transformer = TfidfTransformer()
# The inner fit_transform builds the (tf-idf weighted) document-term matrix;
# the outer fit_transform re-weights it with TfidfTransformer
tfidf = transformer.fit_transform(tfidf_vectorizer.fit_transform(corpus))
# All words in the bag-of-words vocabulary
word = tfidf_vectorizer.get_feature_names()
# Extract the tf-idf matrix: element a[i][j] is the tf-idf weight of word j in document i
weight = tfidf.toarray()

# Write the non-zero weights of every document: the outer loop walks the documents,
# the inner loop walks the words of each document
f = open(r"D:\python_noweightpathway\TIA\TIAsmilarity.txt", "w+")
for i in range(len(weight)):
    # print(u"------- tf-idf weights of document", i, "-------")
    f.write(str(i + 1) + "\t")
    for j in range(len(word)):
        if weight[i][j] > 0:
            f.write(str(j + 1) + ":" + str(weight[i][j]) + " ")
    f.write("\n")
    print(i)
f.close()

# f = open(r"D:\python_noweightpathway\TIA\dictionary.txt", "w+")
# for i in range(len(word)):
#     f.write(str(i) + "\t" + word[i].encode("utf-8") + "\n")
# f.close()

# Rows of the tf-idf matrix are L2-normalised, so tfidf * tfidf.T is the
# cosine-similarity matrix between documents
SimMatrix = (tfidf * tfidf.T).A
print(SimMatrix[1, 3])  # similarity between document 2 and document 4 (0-based indices 1 and 3)
numpy.savetxt(r"D:\python_noweightpathway\TIA\SimMatrix.csv", SimMatrix, delimiter=",")  # save the similarity matrix
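Because TfidfVectorizer normalises every row to unit length by default (norm='l2'), the product (tfidf * tfidf.T).A above is exactly a cosine-similarity matrix. A minimal, self-contained sketch that checks this against sklearn's cosine_similarity, using the commented toy corpus instead of files (the corpus strings are just the example from above):

# Minimal sketch: verify that tfidf * tfidf.T matches sklearn's cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

corpus = ["我 來到 北京 清華大學",
          "我 他 來到 了 網易 杭研 大廈",
          "小明 碩士 畢業 與 中國 科學院",
          "我 愛 北京 天安門"]

vec = TfidfVectorizer()                 # rows are L2-normalised by default
tfidf = vec.fit_transform(corpus)

sim_dot = (tfidf * tfidf.T).A           # dot products of unit-length rows
sim_cos = cosine_similarity(tfidf)      # reference implementation

print(np.allclose(sim_dot, sim_cos))    # True: both give the same similarity matrix
print(sim_dot[0, 3])                    # similarity of document 1 and document 4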

Implementing the TF-IDF algorithm yourself
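The weighting computed by the code below is tf-idf(w, d) = tf(w, d) * log(D / (1 + df(w))), where tf(w, d) is the number of occurrences of w in document d divided by the document length, D is the total number of documents, and df(w) is the number of documents containing w. A tiny numeric sketch (the numbers are made up purely for illustration):

# Tiny numeric sketch of the tf-idf weighting used below (illustrative numbers only)
from math import log

D = 4          # total number of documents in the corpus
tf = 2 / 10    # the word occurs 2 times in a 10-word document
df = 1         # the word appears in 1 document
print(tf * log(D / (1 + df)))   # tf-idf weight of this word in this document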

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# Read the segmented corpus: one document per line, words separated by spaces
file = open("jiebaResult_stopWords.txt", "r", encoding="utf8")
set_words = set()    # vocabulary
words_list = []      # list of documents, each document a list of words
for line in file:
    words = line.strip().split(" ")
    words_list.append(words)
    for word in words:
        set_words.add(word)
file.close()

# One row per document, one column per word
df = pd.DataFrame(columns=list(set_words),
                  data=np.zeros([len(words_list), len(set_words)]))
tf_idf = pd.DataFrame(columns=list(set_words),
                      data=np.zeros([len(words_list), len(set_words)]))

length = len(words_list)              # number of documents
for index in range(length):
    leng = len(words_list[index])     # number of words in this document
    for word in words_list[index]:
        # term frequency: number of occurrences divided by the document length
        df.at[index, word] += 1.0 / leng

for word in set_words:
    # N: number of documents that contain this word
    N = len([i for i in list(df[word]) if i != 0])
    # tf-idf = tf * log(number of documents / (1 + N))
    tf_idf[word] = df[word] * np.log(length / (1 + N))
print(tf_idf)
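The hand-rolled version stops at the tf-idf matrix. To get a document-similarity matrix like the sklearn version, the rows can be L2-normalised and multiplied; a minimal sketch, assuming tf_idf is the DataFrame filled above:

# Minimal sketch: cosine similarity between documents from the hand-computed tf_idf matrix
import numpy as np

matrix = tf_idf.values                              # shape: (num_documents, num_words)
norms = np.linalg.norm(matrix, axis=1, keepdims=True)
norms[norms == 0] = 1                               # avoid division by zero for empty rows
unit = matrix / norms                               # L2-normalise each document vector
sim_matrix = unit @ unit.T                          # sim_matrix[i][j] = cosine similarity of docs i and j
print(sim_matrix)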

Theory reading:
TF-IDF algorithm theory
DataFrame in pandas

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# A zero matrix with 3 rows and 4 columns
a = np.zeros([3, 4])
print(a)

a = [1, 2, 3, 0, 4]
# Count how many documents contain the target word (non-zero entries)
length = len([i for i in a if i != 0])
print(length)

# Getting a single column from a DataFrame and scaling it
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df['col1'] = df['col1'] * 3
print(df)
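One more pandas detail used in the implementation above: updating a single cell. Chained indexing such as df['col1'][0] = ... can fail to update the original frame (it triggers SettingWithCopyWarning), so the implementation uses .at instead; a minimal sketch:

# Minimal sketch: updating a single cell with .at instead of chained indexing
import pandas as pd

df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
df.at[0, 'col1'] = df.at[0, 'col1'] + 10   # read and write one cell by row label and column name
print(df)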