用NLTK對英文語料做預處理,用gensim計算相似度

用NLTK對英文語料做預處理,用gensim計算相似度

import nltk
from nltk.tokenize import word_tokenize

# Load the corpus: one document per line. Lines may carry extra fields
# separated by "###" (the Coursera course-list format this tutorial copies);
# keep only the first field as the document text.
# NOTE(review): assumes the text before the first "###" is the document body —
# confirm against the actual newsfortfidf.txt layout.
with open('F:/iPython/newsfortfidf.txt') as corpus_file:
    testtext = [line.strip().split("###")[0] for line in corpus_file]
print(testtext)

# Tokenize each document string into a list of word tokens.
# (The original tokenized the whole list at once, which raises TypeError.)
texts_tokenized = [word_tokenize(document) for document in testtext]
print(texts_tokenized)

from nltk.corpus import stopwords

# NLTK's built-in English stop-word list; hoisted into a set so the
# per-token membership test below is O(1) instead of O(len(list)).
english_stopwords = set(stopwords.words('english'))
print(english_stopwords)
print(len(english_stopwords))  # original bare expression printed nothing in a script

# Drop stop words from every tokenized document.
texts_filtered_stopwords = [[word for word in document if word not in english_stopwords]
                            for document in texts_tokenized]
print(texts_filtered_stopwords)

# Drop bare punctuation tokens that the tokenizer emits as separate tokens.
english_punctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-'}
texts_filtered = [[word for word in document if word not in english_punctuations]
                  for document in texts_filtered_stopwords]
print(texts_filtered)

from collections import Counter

from nltk.stem.lancaster import LancasterStemmer

# Reduce every token to its Lancaster stem so inflected forms collapse.
st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]
print(texts_stemmed)

# Remove hapax legomena (stems occurring exactly once in the whole corpus):
# they carry no similarity signal and only inflate the dictionary.
# Counter gives one O(n) pass instead of the original O(n^2)
# all_stems.count(stem) inside a loop.
all_stems = sum(texts_stemmed, [])
stem_frequency = Counter(all_stems)
stems_once = {stem for stem, count in stem_frequency.items() if count == 1}
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]

# gensim walkthrough: http://blog.csdn.net/questionfish/article/details/46715795
from gensim import corpora, models, similarities
import logging  # original line had a stray leading space -> IndentationError

# Configure logging so gensim reports model-building progress.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Assign every distinct token in the corpus a unique integer id.
dictionary = corpora.Dictionary(texts)
print(dictionary)
print(dictionary.token2id)  # inspect the token -> id mapping

# doc2bow() counts occurrences of each distinct token, converts tokens to
# their integer ids, and returns each document as a sparse bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

# TF-IDF background: http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
tfidf = models.TfidfModel(corpus)

# Re-weight every bag-of-words vector by TF-IDF.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

# dfs: document frequencies — for each token id, in how many documents it appears.
print(tfidf.dfs)
# idfs: inverse document frequencies — 0 for a token present in every document
# (no discriminative power); the rarer the token, the larger its idf and the
# more representative it is of the documents that contain it.
print(tfidf.idfs)

# Fold the TF-IDF corpus into a 10-topic latent semantic (LSI) space.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
for doc in corpus_lsi:
    print(doc)  # original "... print doc" was a pasted REPL continuation prompt

# Build a cosine-similarity index over every document's LSI projection.
index = similarities.MatrixSimilarity(lsi[corpus])

# Use document 210 as the probe. (The tutorial this was copied from used a
# course list where entry 210 was "Machine Learning" via an undefined
# `courses_name`; here we print the raw document text instead.)
# NOTE(review): assumes the corpus has more than 210 documents — verify.
print(testtext[210])

query_doc = texts[210]
query_bow = dictionary.doc2bow(query_doc)  # original had typo "dicionary"
query_lsi = lsi[query_bow]
print(query_lsi)

# Rank all documents by cosine similarity to the probe, most similar first.
sims = index[query_lsi]
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sort_sims)