用gensim doc2vec計算文字相似度,Python可以跑通的程式碼
阿新 • • 發佈:2018-11-14
Python3.7版本,轉載自:https://blog.csdn.net/juanjuan1314/article/details/75124046
wangyi_title.txt檔案下載地址:連結:https://pan.baidu.com/s/1uL75P13t98YHMqgv3Kx7TQ 密碼:oqxt
對原文有修改,原文程式碼是Python2,有很多問題。
# coding:utf-8
"""Train a gensim Doc2Vec model on news titles and query similar titles.

The script reads ``wangyi_title.txt`` (one title per line), trains a
Doc2Vec model, saves it as ``model_dm_wangyi``, then infers a vector for a
hand-tokenised sample title and prints the ten most similar training titles.
"""
import sys
import gensim
import sklearn
import numpy as np
# NOTE(review): LabeledSentence is unused in this script and is believed to be
# absent from gensim 4.x -- confirm before upgrading gensim.
from gensim.models.doc2vec import Doc2Vec, LabeledSentence

TaggededDocument = gensim.models.doc2vec.TaggedDocument


def get_datasest():
    """Load the training corpus as a list of tagged documents.

    Each line of ``wangyi_title.txt`` is split on single spaces (no further
    word segmentation is performed here) and tagged with its 0-based line
    index. The number of lines read is printed.

    Returns:
        A list of ``TaggededDocument`` objects, one per input line.
    """
    # Explicit encoding: the corpus is Chinese text, and the platform default
    # encoding is locale-dependent.
    # NOTE(review): assumes the file is UTF-8 encoded -- confirm.
    with open("wangyi_title.txt", 'r', encoding='utf-8') as cf:
        docs = cf.readlines()
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        # Only the last token can carry the trailing newline.
        word_list[-1] = word_list[-1].strip()
        x_train.append(TaggededDocument(word_list, tags=[i]))
    return x_train


def getVecs(model, corpus, vector_size):
    """Stack the trained vectors of every document in ``corpus``.

    Args:
        model: A trained Doc2Vec model.
        corpus: Iterable of tagged documents; the first tag of each document
            is used to look up its vector in ``model.docvecs``.
        vector_size: Dimensionality each vector is reshaped to.

    Returns:
        A numpy array with one row of ``vector_size`` values per document.
    """
    vecs = [
        np.array(model.docvecs[doc.tags[0]].reshape(1, vector_size))
        for doc in corpus
    ]
    return np.concatenate(vecs)


def train(x_train, vector_size=200, epoch_num=1):
    """Train a Doc2Vec model on ``x_train`` and save it to ``model_dm_wangyi``.

    Args:
        x_train: List of tagged documents, as returned by ``get_datasest``.
        vector_size: Dimensionality of the document vectors.
        epoch_num: Currently ignored; training always runs for 70 epochs.
            Kept so existing callers keep working.

    Returns:
        The trained Doc2Vec model.
    """
    # Build the model without a corpus so that construction does not already
    # run a training pass; the vocabulary is built and the model trained
    # explicitly, exactly once, below.
    model_dm = Doc2Vec(min_count=1, window=3, vector_size=vector_size,
                       sample=1e-3, negative=5, workers=4)
    model_dm.build_vocab(x_train)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)
    model_dm.save('model_dm_wangyi')
    return model_dm


def test():
    """Infer a vector for a sample title and return its nearest neighbours.

    Loads the model saved by ``train``, prints the inferred vector, and
    returns the result of ``most_similar`` for the ten closest training
    documents.
    """
    model_dm = Doc2Vec.load("model_dm_wangyi")
    # Every token is its own list element; adjacent string literals without a
    # comma would be silently concatenated into a single token.
    test_text = ['《', '舞林', '爭霸', '》', '十強', '出爐',
                 '復活', '舞者', '澳門', '踢館']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    print(inferred_vector_dm)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)
    return sims


if __name__ == '__main__':
    x_train = get_datasest()
    model_dm = train(x_train)
    sims = test()
    for count, sim in sims:
        # ``count`` is the document tag, i.e. its index in ``x_train``.
        sentence = x_train[count]
        # Each word is followed by a space, so the printed text keeps a
        # trailing space.
        words = ''.join(word + ' ' for word in sentence.words)
        print(words, sim, len(sentence.words))
用了網易的熱門娛樂新聞標題作為訓練語料,輸出結果如下,與測試句很相似的句子確實很少。
42754 [-2.1229391e-05 -5.3489220e-04 -1.4628534e-03 1.0101878e-03 1.9613570e-03 1.2337929e-04 -1.5623088e-03 1.4899696e-03 1.7250431e-04 1.7861715e-03 5.4765341e-04 1.7854273e-03 1.4752866e-04 -4.7224312e-04 -2.0143031e-03 -1.3678997e-03 2.1347464e-03 -4.2291704e-04 -2.2612642e-03 1.9719985e-03 -1.7474928e-03 6.7744928e-04 -1.1667489e-03 1.4224678e-03 -4.9147848e-04 1.9250986e-03 1.5286671e-04 -1.0706087e-03 -1.2940766e-03 -1.1336872e-03 -4.8530920e-04 1.4789804e-03 1.7939236e-03 -1.2773223e-03 -2.4406663e-03 1.9606731e-03 2.4594443e-03 1.5459055e-03 -9.8075520e-04 1.6827125e-03 1.4778823e-03 2.0646905e-03 -3.4740806e-05 -1.5140681e-03 -7.6300337e-04 -2.1761435e-03 -1.9383265e-04 6.5391039e-04 -9.3230215e-04 3.8053558e-04 1.6529204e-05 1.5503957e-03 5.2016345e-04 1.2898637e-03 1.7284699e-03 2.2767365e-03 -9.5764997e-05 -8.4209896e-05 -1.5726103e-03 2.2212588e-03 -5.9885468e-04 -2.1759607e-03 -1.9564391e-03 1.2035059e-03 -3.9055874e-04 1.1362566e-03 1.0841021e-03 -9.0546644e-04 2.3774474e-03 1.3961376e-03 -1.8707723e-03 1.5263865e-03 -1.1634092e-03 -2.2435680e-03 1.8672579e-03 5.6013430e-04 2.3103815e-03 1.2101847e-03 -2.4156671e-03 -5.1514624e-04 2.1143679e-03 2.3558659e-03 -1.0352633e-03 -8.4526307e-04 2.2150134e-03 5.3238236e-05 -2.3913602e-03 -1.5362124e-04 1.5323326e-03 2.4526857e-03 -1.6107119e-04 -3.4444834e-04 1.6401864e-03 1.0141496e-03 3.7656463e-04 -1.2738963e-04 -1.1323770e-03 -2.0433934e-03 3.7525350e-04 -1.6017296e-04 -3.3818476e-04 2.2791843e-03 -1.4202974e-03 -2.7641861e-04 1.1009629e-04 -4.2639120e-04 1.8214980e-03 -1.7151656e-03 -1.5390049e-03 -1.3191046e-03 1.7080955e-03 1.1002786e-03 1.6142949e-03 1.8982554e-03 -7.0945674e-04 -4.6570468e-04 9.8265568e-04 -7.4710487e-04 2.4075075e-03 -2.1547875e-03 -2.1082300e-03 -1.8821321e-03 9.6265052e-04 -1.1552537e-03 -1.6849015e-03 -1.2968426e-03 -1.5383511e-04 -7.5135130e-04 -1.8727558e-03 5.2730407e-04 -2.3783895e-03 2.4225495e-03 2.3140633e-03 -1.0093495e-03 1.5953591e-03 -1.6097585e-03 -5.1834644e-04 
5.6184967e-05 2.8760443e-04 2.0393797e-03 1.4612459e-03 2.1953927e-03 -2.1270583e-03 -9.9687604e-04 1.2225753e-03 2.0009447e-03 4.6715033e-04 2.1180776e-03 2.8774102e-04 -8.8365687e-06 -1.7047256e-03 -9.7245700e-04 -4.0429382e-04 1.9775415e-03 -2.2045472e-03 1.5636642e-03 -1.9885909e-03 2.0202452e-03 2.1154643e-03 1.7958126e-03 1.0514902e-03 1.9323002e-03 -1.5818867e-03 1.3666560e-03 -9.1630412e-04 3.2067264e-04 1.7956816e-04 -2.3987342e-03 9.4504084e-04 -2.9586093e-04 -1.6545136e-03 -9.1628381e-04 -1.2085686e-04 8.3511556e-04 9.2640345e-04 -1.0981049e-03 -2.6373079e-04 -1.1188543e-04 -1.0378383e-03 3.7422587e-04 -2.0860252e-03 8.9370640e-04 1.1446123e-03 -1.3295287e-03 1.2766315e-03 1.3684760e-03 2.1959674e-03 6.3199044e-04 -2.7432822e-04 5.7462428e-04 2.3212784e-03 9.1525499e-04 1.9918189e-03 1.1947503e-03 -1.1286519e-03 -6.5884611e-04 -6.7673821e-04 -3.2887704e-04 -1.9954341e-03 -1.1857023e-04] /usr/local/lib/python3.7/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int): 《全民目擊》錯失上影節 導演稱是為品質讓路 0.26860833168029785 2 [星態度]蔡依林:在感情裡我不是強勢者 0.2664705514907837 1 《大魔術師》發劇照 梁朝偉劉青雲等人搞笑耍寶 0.23988798260688782 2 三池崇史戛納迴應媒體反饋:惡評在意料之中 0.23537132143974304 1 《單身男女》試映爆滿獲好評 本週四正式上映 0.22329775989055634 2 金雞百花節門票創“史上天價”最貴11560元 0.2214442789554596 1 《霍位元人》登雜誌封面 曝彼得傑克遜工作照 0.2193169891834259 2 章子怡迴應與汪峰婚期:應該快了 我也不小了 0.21707454323768616 2 《畫壁》發MV 鄧超:這是屬於我和孫儷的愛情 0.21643859148025513 2 《生化危機5》曝戰神自白預告 重現屠魔之旅 0.21323256194591522 2 [Finished in 114.1s]
1
1