Python crawler: wrapping my own commonly used helper methods
阿新 • Published: 2019-01-03
import urllib
import urllib.request
import ssl
import re
from collections import deque


def writeFile2Strs(url, topath):
    # Fetch the page and write it to a file as UTF-8 text.
    with open(topath, "w", encoding="utf-8") as f:
        f.write(getHtmlBytes(url).decode("utf-8"))


def writeFile2Bytes(url, topath):
    # Fetch the page and write the raw bytes to a file.
    with open(topath, "wb") as f:
        f.write(getHtmlBytes(url))


def getHtml_Str(url, decode="utf-8"):
    # Fetch the page and return it as a decoded string.
    return getHtmlBytes(url).decode(decode)


def getURL_list(strs):
    # Extract all URLs from a string with a regular expression.
    parUrl = r"(((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?)"
    re_URL = re.compile(parUrl)
    # Because the pattern has several capture groups, findall() returns a tuple
    # of groups for each match; deduplicate with set().
    listURL = list(set(re_URL.findall(strs)))
    listURLs = []
    for URLi in listURL:
        # Group [0] of each tuple is the full URL match.
        listURLs.append(URLi[0])
    return listURLs


def getQQ_list(strs):
    # Extract all QQ-number-like digit strings (5 to 11 digits, no leading zero).
    pat = r"[1-9]\d{4,10}"
    re_pat = re.compile(pat)
    listQQ = re_pat.findall(strs)
    listQQ = list(set(listQQ))
    return listQQ


def proceedAllUrlList(url, urlProceed):
    # Breadth-first crawl: process every URL reachable from the start URL.
    # A visited set prevents re-crawling the same page and looping forever.
    visited = set()
    dq = deque()
    dq.append(url)
    while len(dq) != 0:
        targeturl = dq.popleft()
        if targeturl in visited:
            continue
        visited.add(targeturl)
        urlList = getURL_list(getHtml_Str(targeturl))
        urlProceed(targeturl)
        for oneURL in urlList:
            dq.append(oneURL)


def getHtmlBytes(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # Create an unverified SSL context so HTTPS pages with bad certificates still load.
    context = ssl._create_unverified_context()
    try:
        response = urllib.request.urlopen(req, timeout=5, context=context)
    except Exception:
        print("Request timed out; giving up on this URL")
        # Return empty bytes so callers that immediately call .decode() do not crash.
        return b""
    return response.read()
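To show how these helpers fit together, here is a minimal usage sketch. It is not part of the original post: the module name crawler_utils and the start URL are placeholders, and it assumes the code above has been saved as that module.

# Minimal usage sketch; crawler_utils and the start URL are placeholders.
from crawler_utils import (writeFile2Bytes, getHtml_Str, getURL_list,
                           getQQ_list, proceedAllUrlList)

start_url = "https://example.com"        # placeholder start URL

writeFile2Bytes(start_url, "page.html")  # save the raw page bytes to disk

html = getHtml_Str(start_url)            # fetch and decode the page as UTF-8
print(getURL_list(html))                 # every URL the regex matched
print(getQQ_list(html))                  # every QQ-number-like digit string

# Crawl outward from the start URL, printing each page as it is processed.
proceedAllUrlList(start_url, lambda u: print("processing", u))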