Computing Sentence/Document Vectors and Text Similarity with Doc2Vec
Note: this post is mainly a record of the simple Doc2Vec snippets I use regularly, so there is little explanation here, only the code itself. If anything is unclear, feel free to discuss.
1. Computing document vectors with doc2vec
```python
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Read and tag the data: each sentence becomes a TaggedDocument
# whose tag is its line index in the corpus.
def get_dataset(sentences):
    all_sentences = []
    for i, sentence in enumerate(sentences):
        all_sentences.append(TaggedDocument(sentence.split(), tags=[i]))
    return all_sentences

# Collect the learned document vector of every sentence in the corpus.
def getVecs(model, corpus, vector_size):
    vecs = [np.array(model.docvecs[z.tags[0]]).reshape(1, vector_size) for z in corpus]
    return np.concatenate(vecs)

# Train a Doc2Vec model on the tagged sentences.
def train(all_sentences, vector_size, min_count, epochs):
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(all_sentences)
    model.train(all_sentences, total_examples=model.corpus_count, epochs=model.epochs)
    return model

if __name__ == "__main__":
    vector_size, min_count, epochs = 100, 2, 20   # example hyperparameters
    sentences = open('sentence.txt', 'r').readlines()
    all_sentences = get_dataset(sentences)
    model = train(all_sentences, vector_size, min_count, epochs)
    sentence_vecs = getVecs(model, all_sentences, vector_size)
```
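To see these pieces working end to end, here is a minimal, self-contained sketch. The three-sentence toy corpus, the file name toy_doc2vec.model, and the tiny hyperparameter values are made up for illustration; it trains on sentences held in memory instead of sentence.txt, checks the shape of the resulting vector matrix, and saves/reloads the model with gensim's save/load.

```python
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus standing in for sentence.txt; each string is one "document".
toy_sentences = [
    "the cat sat on the mat",
    "the dog barked at the cat",
    "stock prices fell sharply today",
]
corpus = [TaggedDocument(s.split(), tags=[i]) for i, s in enumerate(toy_sentences)]

# min_count=1 so no words are discarded from such a tiny corpus;
# small vector_size and epochs keep the example fast.
model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

# One learned vector per document, indexed by the integer tag.
vecs = np.vstack([model.docvecs[i] for i in range(len(toy_sentences))])
print(vecs.shape)   # (3, 50)

# Persist the model and reload it for later inference.
model.save("toy_doc2vec.model")
model = Doc2Vec.load("toy_doc2vec.model")
```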
2. Computing text similarity with doc2vec
```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# get_dataset() and train() are the same as in section 1.

# Infer a vector for a new piece of text and return the 10 most similar
# documents from the training corpus as (tag, cosine similarity) pairs.
def similarity(model):
    test_text = 'xxx xxx xxxxx'.split()
    inferred_vector = model.infer_vector(test_text)
    sims = model.docvecs.most_similar([inferred_vector], topn=10)
    return sims

if __name__ == '__main__':
    vector_size, min_count, epochs = 100, 2, 20   # example hyperparameters
    sentences = open('sentence.txt', 'r').readlines()
    all_sentences = get_dataset(sentences)
    model = train(all_sentences, vector_size, min_count, epochs)
    sims = similarity(model)
```
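most_similar returns a list of (tag, cosine similarity) pairs, where the tag is the integer assigned in TaggedDocument. As a quick sanity check, the self-contained sketch below (again using a made-up toy corpus, not part of the original post) re-infers a vector for one of the training sentences and confirms that its own tag ranks at or near the top.

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus for illustration only.
toy_sentences = [
    "the cat sat on the mat",
    "the dog barked at the cat",
    "stock prices fell sharply today",
]
corpus = [TaggedDocument(s.split(), tags=[i]) for i, s in enumerate(toy_sentences)]

model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Infer a vector for a sentence the model was trained on and check that
# its own tag appears at or near the top of the returned neighbours.
inferred = model.infer_vector("the cat sat on the mat".split())
for tag, score in model.docvecs.most_similar([inferred], topn=3):
    print(tag, round(score, 3), toy_sentences[tag])   # score is cosine similarity
```

Because infer_vector is itself a small stochastic optimisation, the scores vary slightly between runs; on a real corpus you would typically average over several inferences or increase the epochs argument of infer_vector for more stable results.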