使用gensim訓練word2vec模型並應用:以小說《人民的名義》為例
阿新 • • 發佈:2021-07-12
首先對小說進行預處理:使用jieba對全文進行分詞,並將結果以空格分隔寫入檔案
import jieba
import jieba.analyse

# Character names that appear in the novel. Each one is registered through
# jieba.suggest_freq(name, True), which asks jieba to tune the word's
# frequency so that the name can be segmented out as a single token.
CHARACTER_NAMES = (
    '沙瑞金', '田國富', '高育良', '侯亮平', '鍾小艾', '陳岩石', '歐陽菁',
    '易學習', '王大路', '蔡成功', '孫連城', '季昌明', '丁義珍', '鄭西坡',
    '趙東來', '高小琴', '趙瑞龍', '林華華', '陸亦可', '劉新建', '劉慶祝',
)
for character_name in CHARACTER_NAMES:
    jieba.suggest_freq(character_name, True)

RAW_NOVEL_PATH = './in_the_name_of_people.txt'
SEGMENTED_NOVEL_PATH = './in_the_name_of_people_segment.txt'

# Load the whole novel into memory as UTF-8 text.
with open(RAW_NOVEL_PATH, encoding="utf8") as novel_file:
    document = novel_file.read()

# jieba.cut produces its tokens lazily, so they can be consumed only once.
# Do not print or join `document_cut` for debugging before the join below:
# doing so would exhaust it and leave `result` empty.
document_cut = jieba.cut(document)

# Space-separated tokens: the segmented corpus written out for training.
result = ' '.join(document_cut)

with open(SEGMENTED_NOVEL_PATH, 'w', encoding="utf8") as segmented_file:
    segmented_file.write(result)
訓練模型:
import logging
import os

from gensim.models import word2vec

# Configure root logging at INFO level with a timestamped format so that
# log records emitted during training are shown.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Stream the pre-segmented corpus from disk (one sentence per line,
# tokens separated by whitespace).
sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt')

# Training hyperparameters; everything not listed keeps gensim's default.
training_options = {
    'hs': 1,         # enable hierarchical softmax
    'min_count': 1,  # keep every token, even those seen only once
    'window': 3,     # context window of 3 tokens
}
model = word2vec.Word2Vec(sentences, **training_options)
模型的三種應用:
# Application 1: list the five words most similar to a given word.
nearest_words = model.wv.similar_by_key("沙瑞金", topn=5)
print(nearest_words)

# Application 2: measure how similar two specific words are.
pair_similarity = model.wv.similarity("沙瑞金", "高育良")
print("沙瑞金|高育良的相似度:", pair_similarity)

# Application 3: pick the word that does not belong with the others.
candidate_words = ["沙瑞金", "高育良", "李達康", "劉慶祝"]
odd_one_out = model.wv.doesnt_match(candidate_words)
print("沙瑞金 高育良 李達康 劉慶祝中不同類別的詞為:", odd_one_out)