爬取嗶哩嗶哩彈幕製作詞雲
阿新 • • 發佈:2018-11-16
爬取嗶哩嗶哩的彈幕,http://comment.bilibili.com/6315651.xml
需要先知道影片的cid:在瀏覽器按F12開啟開發者工具,再按F5重新整理頁面,在請求中找到cid,然後拼接成 http://comment.bilibili.com/<cid>.xml 這樣的url
也可以寫程式碼,解析response獲取cid,然後再拼接
使用requests或者urllib都可以
我是用requests,請求該連結獲取到xml檔案
程式碼:獲取xml
def get_data():
    """Download the danmaku XML for cid 6315651 and save it as ``gugongdanmu.xml``.

    The response body is decoded as UTF-8 and written to the current working
    directory, replacing any previous download.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get('http://comment.bilibili.com/6315651.xml', timeout=30)
    # Fail loudly instead of saving an HTTP error page as if it were danmaku.
    res.raise_for_status()
    res.encoding = 'utf8'
    # Open with 'w', not 'a': appending on every run would stack several XML
    # documents in one file and inflate every downstream count.
    with open('gugongdanmu.xml', 'w', encoding='utf8') as f:
        f.write(res.text)
解析xml,用正則去掉標籤,只保留彈幕文字:
def analyze_xml():
    """Strip XML tags from ``gugongdanmu.xml`` and write the text to ``tanmu2.txt``.

    The input is processed one line at a time: every ``<...>`` span on a line
    is removed and the remainder (including its newline) is written out.
    The number of lines processed is printed when done.

    Raises:
        FileNotFoundError: if ``gugongdanmu.xml`` does not exist.
    """
    # Matches a single tag such as <d p="..."> or </d>. Because the file is
    # handled line by line, a tag split across two lines is not matched.
    tag_pattern = re.compile(r'<[^>]+>', re.S)
    count = 0
    # Context managers guarantee both files are closed (and the output is
    # flushed) even if an error occurs part-way through.
    with open("gugongdanmu.xml", "r", encoding='utf8') as f1, \
            open("tanmu2.txt", "w", encoding='utf8') as f2:
        for line in f1:
            # Replace every matched tag with the empty string.
            f2.write(tag_pattern.sub('', line))
            count += 1
    print(count)
去掉無用的字元和數字,找出所有的漢字
def analyze_hanzi():
    """Keep only the Chinese characters from ``tanmu2.txt`` and write them to ``tanmu3.txt``.

    Each input line is scanned for runs of characters in the range
    ``一``-``龥``; everything else (digits, Latin letters, punctuation,
    newlines) is dropped. The matched runs are written back-to-back with no
    separator, so the output is one continuous string. The number of lines
    processed is printed when done.

    Raises:
        FileNotFoundError: if ``tanmu2.txt`` does not exist.
    """
    # One or more consecutive characters in the range 一-龥.
    hanzi_pattern = re.compile(r'[一-龥]+', re.S)
    count = 0
    # Context managers guarantee both files are closed (and the output is
    # flushed) even if an error occurs part-way through.
    with open("tanmu2.txt", "r", encoding='utf8') as f1, \
            open("tanmu3.txt", "w", encoding='utf8') as f2:
        for line in f1:
            # findall returns the list of matched runs; writelines emits them
            # one after another without adding anything in between.
            f2.writelines(hanzi_pattern.findall(line))
            count += 1
    print(count)
使用jieba分詞,生成詞雲
def show_sign():
    """Build and display a word cloud from the danmaku text.

    Steps:
      1. Load the text via ``read_txt_file()`` and segment it with jieba.
      2. Drop every token listed in ``stopwords.txt``.
      3. Count occurrences per token and sort by count, descending.
      4. Render the top 1000 tokens into a word cloud shaped and coloured
         by ``ciyun.png``, save it as ``output.png`` and show it.

    The filtered token table and the frequency mapping are printed along
    the way.
    """
    text = read_txt_file()
    tokens = pd.DataFrame({'segment': jieba.lcut(text)})

    # One stop word per row; quoting=3 (csv.QUOTE_NONE) disables quote handling.
    stop_table = pd.read_csv(
        "stopwords.txt",
        index_col=False,
        quoting=3,
        sep=" ",
        names=['stopword'],
        encoding='utf-8',
    )
    is_stopword = tokens.segment.isin(stop_table.stopword)
    tokens = tokens[~is_stopword]
    print(tokens)
    print('-------------------------------')

    # Occurrences per token, as a frame with the count column named '計數'.
    counts = tokens.groupby(by=['segment'])['segment'].agg(numpy.size).to_frame()
    counts.columns = ['計數']
    counts = counts.reset_index()
    counts = counts.sort_values(by=["計數"], ascending=False)

    # Word cloud settings.
    mask_image = imread('ciyun.png')
    cloud = WordCloud(
        font_path="simhei.ttf",    # a font that can render Chinese glyphs
        background_color="white",  # background colour
        max_words=1000,            # maximum number of words drawn
        mask=mask_image,           # image used as the cloud's shape
        max_font_size=100,         # largest font size
        random_state=42,
        # Default canvas size; when a mask image is used the saved picture
        # follows the mask's size instead. margin is the gap around words.
        width=1000, height=860, margin=2,
    )

    # Feed pre-computed frequencies (generate() would take raw text instead).
    top_rows = counts.head(1000).values
    frequencies = {row[0]: row[1] for row in top_rows}
    print(frequencies)
    cloud.generate_from_frequencies(frequencies)

    # Recolour the words using colours sampled from the mask image.
    cloud.recolor(color_func=ImageColorGenerator(mask_image))

    # Save to disk, then display without axes.
    cloud.to_file('output.png')
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
執行程式,結果:
統計的結果
完成!
pip預設的源太慢,建議先換源,然後把自己還沒有安裝的庫裝上