python 多線程方法爬取微信公眾號文章
阿新 • • 發佈:2018-06-05
微信爬蟲 多線程爬蟲
本文在上一篇基礎上增加多線程處理(http://blog.51cto.com/superleedo/2124494 )
執行思路:
1,規劃好執行流程,建立兩個執行線程,一個控制線程
2,線程1用於獲取url,並寫入urlqueue隊列
3,線程2,通過線程1的url獲取文章內容,並保存到本地文件中
4,線程3用於控制程序,保證1,2線程都執行完後退出
5,多線程退出程序,在子線程設置daemon為true,保證程序正常退出
6,添加異常處理,添加限時防止屏蔽
閑話不多說,上代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl WeChat articles listed by Sogou search, using three threads.

* ``getlisturl`` (producer) downloads the search-result pages for a keyword
  and puts every article URL it finds on a shared queue.
* ``getcontent`` (consumer) takes URLs from the queue, downloads each
  article and appends its title and body to a local HTML file.
* ``contrl`` (controller) keeps the process alive until the work is done.

The producer always finishes by queueing ``STOP`` so the consumer knows when
to write the HTML footer and close the file.
"""
import queue
import re
import sys
import threading
import time
import urllib.error
import urllib.parse
import urllib.request

# Queue shared by the producer and the consumer thread.
urlqueue = queue.Queue()

# Send a desktop-browser User-Agent with every request made via urlopen().
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# One entry per search-result page fetched: the list of article links on it.
listurl = []

# Optional helper for fetching through an HTTP proxy (disabled).
# def use_proxy(proxy_addr, url):
#     try:
#         import urllib.request
#         proxy = urllib.request.ProxyHandler({'http': proxy_addr})
#         opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
#         urllib.request.install_opener(opener)
#         data = urllib.request.urlopen(url).read().decode('utf-8')
#         data = str(data)
#         return data
#     except urllib.error.URLError as e:
#         if hasattr(e, "code"):
#             print(e.code)
#         if hasattr(e, "reason"):
#             print(e.reason)
#         time.sleep(10)
#     except Exception as e:
#         print("exception" + str(e))
#         time.sleep(1)

SEARCH_URL = "http://weixin.sogou.com/weixin?type=2&query="
OUTPUT_PATH = "/home/urllib/test/1.html"
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled request from hanging a thread

# Queue item that tells the consumer no more URLs will arrive.
STOP = None

LINK_RE = re.compile(r'<a data-z="art".*?(http://.*?)"', re.S)
TITLE_RE = re.compile(r'var msg_title = "(.*?)";')
CONTENT_RE = re.compile(r'id="js_content">(.*?)id="js_sg_bar"', re.S)

# Placeholder used when a title or body cannot be found in an article page.
NOT_FOUND = "此次沒有獲取到"

HTML_HEAD = '''<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>微信文章頁面</title>
</head>
<body>
'''
HTML_TAIL = '''</body>
</html>
'''


def fetch(url):
    """Download *url* and return its body decoded as UTF-8."""
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode("utf-8")


def report_and_back_off(err):
    """Print *err* and pause before the caller continues.

    ``URLError`` (including ``HTTPError``) prints its ``code``/``reason`` when
    present and waits 10 seconds; any other exception waits 1 second.
    """
    if isinstance(err, urllib.error.URLError):
        if hasattr(err, "code"):
            print(err.code)
        if hasattr(err, "reason"):
            print(err.reason)
        time.sleep(10)
    else:
        print("exception" + str(err))
        time.sleep(1)


def extract_links(page_html):
    """Return the article URLs found in one search-result page.

    The links are HTML-escaped in the page source, so ``&amp;`` is turned
    back into ``&`` to obtain a usable URL.
    """
    return [link.replace("&amp;", "&") for link in LINK_RE.findall(page_html)]


def parse_article(page_html):
    """Return ``(title, content)`` extracted from an article page.

    Either element is ``NOT_FOUND`` when its pattern does not match.
    """
    titles = TITLE_RE.findall(page_html)
    contents = CONTENT_RE.findall(page_html)
    title = titles[0] if titles else NOT_FOUND
    content = contents[0] if contents else NOT_FOUND
    return title, content


def format_article(title, content):
    """Return the HTML fragment written to the output file for one article."""
    return "<p>標題為:" + title + "</p><p>內容為:" + content + "</p><br>"


class getlisturl(threading.Thread):
    """Producer: collect article URLs from the search pages and queue them.

    Args:
        key: search keyword.
        pagestart: first result page to fetch (inclusive).
        pageend: last result page to fetch (inclusive).
        urlqueue: queue that receives the article URLs, then ``STOP``.
    """

    def __init__(self, key, pagestart, pageend, urlqueue):
        super().__init__()
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.urlqueue = urlqueue

    def run(self):
        try:
            # Use this thread's own keyword, not a module-level global.
            keycode = urllib.parse.quote(self.key)
            for page in range(self.pagestart, self.pageend + 1):
                url = SEARCH_URL + keycode + "&page=" + str(page)
                try:
                    links = extract_links(fetch(url))
                except Exception as err:
                    # A failed page must not end the crawl; report, wait,
                    # and move on to the next page.
                    report_and_back_off(err)
                    continue
                listurl.append(links)
                i = len(listurl) - 1
                for j, link in enumerate(links):
                    print("第" + str(i) + "i" + str(j) + "j次入隊")
                    self.urlqueue.put(link)
                # Throttle search requests to avoid being blocked.
                time.sleep(2)
            print("共獲取到" + str(len(listurl)) + "頁")
        finally:
            # Always signal end-of-stream, even after an unexpected failure,
            # so the consumer can finish the output file.
            self.urlqueue.put(STOP)


class getcontent(threading.Thread):
    """Consumer: download each queued article and append it to an HTML file.

    Args:
        urlqueue: queue of article URLs, terminated by ``STOP``.
        outpath: file to write; it is overwritten on every run.
    """

    def __init__(self, urlqueue, outpath=OUTPUT_PATH):
        super().__init__()
        self.urlqueue = urlqueue
        self.outpath = outpath

    def run(self):
        with open(self.outpath, "wb") as fh:
            fh.write(HTML_HEAD.encode("utf-8"))
            count = 1
            while True:
                url = self.urlqueue.get()
                try:
                    if url is STOP:
                        break
                    title, content = parse_article(fetch(url))
                    fh.write(format_article(title, content).encode("utf-8"))
                    print("第" + str(count) + "個網頁處理")
                    # Throttle article requests to avoid being blocked.
                    time.sleep(1)
                    count += 1
                except Exception as err:
                    # One bad article must not stop the remaining ones.
                    report_and_back_off(err)
                finally:
                    # The consumer, not the producer, marks an item as done.
                    self.urlqueue.task_done()
            fh.write(HTML_TAIL.encode("utf-8"))


class contrl(threading.Thread):
    """Controller: keep the process alive until the crawl has finished.

    Args:
        urlqueue: the shared URL queue.
        workers: already-started threads to wait for. When given, the
            controller returns once none of them is alive. When empty, it
            falls back to treating an empty queue as "finished", which is
            only a heuristic (the queue is also empty before the first URL
            arrives and while the last article is being processed).
        poll_interval: seconds between two status checks.
    """

    def __init__(self, urlqueue, workers=(), poll_interval=60):
        super().__init__()
        self.urlqueue = urlqueue
        self.workers = tuple(workers)
        self.poll_interval = poll_interval

    def _wait_and_check(self):
        """Wait up to ``poll_interval`` seconds; return True when finished."""
        if not self.workers:
            time.sleep(self.poll_interval)
            return self.urlqueue.empty()
        deadline = time.monotonic() + self.poll_interval
        for worker in self.workers:
            worker.join(max(0.0, deadline - time.monotonic()))
        return not any(worker.is_alive() for worker in self.workers)

    def run(self):
        while True:
            print("程序執行中.....")
            if self._wait_and_check():
                print("程序執行完畢。。。")
                # Returning ends this (non-daemon) thread, which lets the
                # interpreter exit; exit() in a thread would not do more.
                return


key = "科技"
# proxy = "122.114.31.177:808"
pagestart = 1
pageend = 2

if __name__ == "__main__":
    # Worker threads are daemons so they can never keep the process alive
    # on their own; the controller decides when the program ends.
    t1 = getlisturl(key, pagestart, pageend, urlqueue)
    t1.daemon = True
    t1.start()
    t2 = getcontent(urlqueue)
    t2.daemon = True
    t2.start()
    t3 = contrl(urlqueue, workers=(t1, t2))
    t3.start()
執行結果正常:
瀏覽器打開1.html
以上代碼可以直接使用
python 多線程方法爬取微信公眾號文章