1. 程式人生 > 其它 >gensim訓練模型並使用以人民的名義小說為例子

使用 gensim 訓練 Word2Vec 模型並加以應用:以小說《人民的名義》為例

首先對小說進行預處理:使用 jieba 對全文進行分詞。

import jieba
import jieba.analyse

# Character names from the novel. Each one is registered with jieba (tune=True)
# so the segmenter is nudged to keep the full name together as one token.
character_names = (
    '沙瑞金',
    '田國富',
    '高育良',
    '侯亮平',
    '鍾小艾',
    '陳岩石',
    '歐陽菁',
    '易學習',
    '王大路',
    '蔡成功',
    '孫連城',
    '季昌明',
    '丁義珍',
    '鄭西坡',
    '趙東來',
    '高小琴',
    '趙瑞龍',
    '林華華',
    '陸亦可',
    '劉新建',
    '劉慶祝',
)
for name in character_names:
    jieba.suggest_freq(name, True)

# Read the whole novel as UTF-8 text.
with open('./in_the_name_of_people.txt', encoding="utf8") as f:
    document = f.read()

# NOTE: jieba.cut returns a one-shot iterator. Consuming it beforehand (for
# example by printing the joined tokens) would leave nothing for the join
# below, so the tokens are joined exactly once here.
document_cut = jieba.cut(document)
result = ' '.join(document_cut)

# Write the space-separated tokens out; this file is the training corpus.
with open('./in_the_name_of_people_segment.txt', 'w', encoding="utf8") as f2:
    f2.write(result)

訓練模型:

import logging
import os
from gensim.models import word2vec

# Configure root logging at INFO level with a timestamped format, so that
# library log messages emitted during training are shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Corpus reader over the segmented novel written by the jieba step.
# Presumably one sentence per line with whitespace-separated tokens; see the
# gensim LineSentence documentation.
segmented_path = './in_the_name_of_people_segment.txt'
sentences = word2vec.LineSentence(segmented_path)

# Training options passed to Word2Vec: hs=1, min_count=1, window=3.
# All other hyper-parameters are left at the library defaults.
training_options = {'hs': 1, 'min_count': 1, 'window': 3}
model = word2vec.Word2Vec(sentences, **training_options)

模型的三種應用:

# 1) Find the words closest to a given word (top 5 results).
nearest_words = model.wv.similar_by_key("沙瑞金", topn=5)
print(nearest_words)

# 2) Check how similar two words are.
pair_similarity = model.wv.similarity("沙瑞金", "高育良")
print("沙瑞金|高育良的相似度:", pair_similarity)

# 3) Pick out the word that does not belong with the others.
candidate_words = "沙瑞金 高育良 李達康 劉慶祝".split()
odd_word = model.wv.doesnt_match(candidate_words)
print("沙瑞金 高育良 李達康 劉慶祝中不同類別的詞為:", odd_word)