NLP計算文件相似度之TF-IDF
阿新 • • 發佈:2019-01-30
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy
import os
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# sys.setdefaultencoding('utf8')  -- Python 2 leftover, not needed on Python 3.
# Expected input: one pre-segmented document per line, tokens separated by
# spaces, e.g.
# corpus = ["我 來到 北京 清華大學", "我 他 來到了 網易 杭研 大廈",
#           "小明 碩士 畢業 與 中國 科學院", "我 愛 北京 天安門"]

# Raw strings keep the Windows backslashes literal; sequences such as "\p" or
# "\T" in a normal string are invalid escapes and trigger warnings on current
# Python versions.
with open(r'D:\python_noweightpathway\TIA\TIAxmmc.txt', 'r', encoding='utf8') as trainfile:
    traincorpus = trainfile.readlines()  # documents are separated by newlines
corpus = traincorpus

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  # max_features=n_features,
                                   stop_words='english')
# TfidfVectorizer counts terms AND applies the TF-IDF weighting in one step.
# The original code fed this result into a second TfidfTransformer, which
# multiplied by the IDF a second time and distorted every weight, so the extra
# transformer (and the never-used CountVectorizer instance) are gone.
tfidf = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary in column order. get_feature_names() was removed from recent
# scikit-learn releases; fall back to it only on old installations.
try:
    word = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    word = tfidf_vectorizer.get_feature_names()

# Dense view of the matrix: weight[i][j] is the TF-IDF weight of term j in
# document i.
weight = tfidf.toarray()

# One line per document: "<doc number>\t<term number>:<weight> ..." listing
# only the terms with a positive weight (both numbers are 1-based).
with open(r"D:\python_noweightpathway\TIA\TIAsmilarity.txt", "w+") as f:
    for i, row in enumerate(weight):
        f.write(str(i + 1) + "\t")
        for j, value in enumerate(row):
            if value > 0:
                f.write(str(j + 1) + ":" + str(value) + " ")
        f.write("\n")
        print(i)  # progress indicator

# Optional: dump the vocabulary next to the weights.
# with open(r"D:\python_noweightpathway\TIA\dictionary.txt", "w+", encoding="utf8") as f:
#     for i, term in enumerate(word):
#         f.write(str(i) + "\t" + term + "\n")

# Pairwise document similarity as the product of the TF-IDF matrix with its
# transpose. With the vectorizer's default L2 row normalisation this is the
# cosine similarity. "@" is a matrix product for every scipy sparse type, and
# .toarray() replaces the ".A" shortcut that newer SciPy no longer provides.
SimMatrix = (tfidf @ tfidf.T).toarray()
# Indices are 0-based: this prints the similarity of the 2nd and 4th documents
# (the original comment said "1st and 4th"). Needs at least four documents.
print(SimMatrix[1, 3])
numpy.savetxt(r"D:\python_noweightpathway\TIA\SimMatrix.csv", SimMatrix, delimiter=",")  # save the similarity matrix
自己實現TF-IDF演算法
# -*-coding:utf8-*-
import pandas as pd
from numpy import *
def _read_documents(path):
    """Read a UTF-8 text file holding one pre-segmented document per line.

    Tokens are split on any run of whitespace, so repeated spaces do not
    produce empty-string tokens. A blank line yields an empty document, which
    still counts towards the total number of documents.
    """
    with open(path, "r", encoding="utf8") as handle:
        return [line.split() for line in handle]


def compute_tf_idf(documents):
    """Return a DataFrame of TF-IDF weights for tokenised documents.

    Parameters:
        documents: sequence of token lists, one list per document.

    Returns:
        A pandas DataFrame with one row per document (in input order) and one
        column per distinct token (sorted, so the layout is reproducible).
        Each cell is tf * idf where
            tf  = occurrences of the token in the document / document length
            idf = log(number of documents / (1 + documents containing token))
        The "+1" smoothing means a token present in (almost) every document
        gets a zero or negative idf.
    """
    vocabulary = sorted({token for tokens in documents for token in tokens})
    column_of = {token: position for position, token in enumerate(vocabulary)}
    n_docs = len(documents)

    # Term frequency: every occurrence contributes 1/len(document).
    tf = zeros([n_docs, len(vocabulary)])
    for row, tokens in enumerate(documents):
        for token in tokens:
            tf[row, column_of[token]] += 1.0 / len(tokens)

    # Document frequency per token, then the smoothed inverse document
    # frequency, computed once per vocabulary column.
    doc_freq = (tf != 0).sum(axis=0)
    idf = log(n_docs / (1 + doc_freq))

    return pd.DataFrame(tf * idf, columns=vocabulary)


if __name__ == "__main__":
    words_list = _read_documents("jiebaResult_stopWords.txt")
    df = compute_tf_idf(words_list)
    print(df)
演算法理論學習:
TF-IDF演算法理論學習
pandas中DataFrame學習
#-*-coding:utf8-*-
import pandas as pd
from numpy import *
a=zeros([3,4])
print(a)
a=[1,2,3,0,4]
#計算出現目標詞的文件的個數
lenght=len([i for i in a if i!=0])
print(lenght)
#資料框獲取一列
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df['col1']=df['col1']*3
print(df)