用NLTK對英文語料做預處理,用gensim計算相似度
阿新 • 發佈:2019-01-05
# Preprocess an English corpus with NLTK, then build a gensim dictionary,
# bag-of-words corpus, and TF-IDF model over it.
import logging

import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from gensim import corpora, models, similarities
# gensim tutorial reference: http://blog.csdn.net/questionfish/article/details/46715795

# Read the corpus: one document per line.
# Fix: the original opened the file but never read it (the read was commented
# out), leaving `testtext` undefined; also closed the handle via `with`.
with open('F:/iPython/newsfortfidf.txt') as f:
    testtext = [line.strip() for line in f]
print(testtext)

# Tokenize each document (the original tokenized the whole list at once).
texts_tokenized = [nltk.word_tokenize(document) for document in testtext]
print(texts_tokenized)

# Remove English stopwords.
english_stopwords = stopwords.words('english')
print(english_stopwords)
print(len(english_stopwords))
texts_filtered_stopwords = [
    [word for word in document if word not in english_stopwords]
    for document in texts_tokenized
]
print(texts_filtered_stopwords)

# Remove punctuation tokens.
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                        '&', '!', '*', '@', '#', '$', '%', '-']
texts_filtered = [
    [word for word in document if word not in english_punctuations]
    for document in texts_filtered_stopwords
]
print(texts_filtered)

# Stem every token with the Lancaster stemmer.
st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]
print(texts_stemmed)

# Drop stems that occur exactly once in the whole corpus — they carry no
# similarity signal.
all_stems = sum(texts_stemmed, [])
stems_once = {stem for stem in set(all_stems) if all_stems.count(stem) == 1}
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]

# Configure gensim's logging output format and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Dictionary: assigns a unique integer id to every word in the corpus.
dictionary = corpora.Dictionary(texts)
print(dictionary)
print(dictionary.token2id)  # word -> id mapping

# doc2bow() counts occurrences of each distinct word, maps words to their ids,
# and returns each document as a sparse vector.
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

# TF-IDF background: http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
print(tfidf.dfs)   # dict: word id -> number of documents the word appears in
print(tfidf.idfs)  # dict: word id -> inverse document frequency
# idfs semantics: the value measures how representative a word is. A word that
# appears in every document says nothing about any of them (value 0); the
# fewer documents a word appears in, the more representative it is of those
# documents.

# Train a 10-topic LSI model on the TF-IDF corpus and project the corpus
# into the LSI space.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
for doc in corpus_lsi:
    print(doc)

# Build a dense similarity index over the LSI-transformed corpus.
index = similarities.MatrixSimilarity(lsi[corpus])

# Query: find the documents most similar to document 210.
# NOTE(review): `courses_name` (expected output: "Machine Learning") is never
# defined in this snippet — it must be loaded elsewhere; confirm before running.
print(courses_name[210])
ml_course = texts[210]
ml_bow = dictionary.doc2bow(ml_course)  # fixed typo: was `dicionary`
ml_lsi = lsi[ml_bow]
print(ml_lsi)
sims = index[ml_lsi]
# Sort all documents by descending similarity (fixed: the original line was
# missing its closing parenthesis).
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])