Python獲取個人網站的所有課程下載鏈接和密碼,並保存到Mongodb中
阿新 • • 發佈:2017-06-04
one find() net agent play col pat 進行 jpg
1、獲取網站課程的分類地址;
"""Scrape the site home page and collect every course-category name and URL."""
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}


def get_class_data():
    """Return a list of dicts, one per category: {'類別名稱': name, '類別鏈接': url}.

    Only links ending in '/' (directory-style category pages) are kept;
    everything else in the menu is skipped.
    """
    list_data = []
    # NOTE(review): the blog scraper fused 'View Code' onto this URL in the
    # original text; the real index page is below.
    url = 'http://www.diaosiweb.net/index.html'
    responese = requests.get(url, headers=headers)
    # Let requests guess the page's real encoding (the site is not UTF-8).
    responese.encoding = responese.apparent_encoding
    # Parse once and reuse the tree (the original parsed the HTML twice).
    tree = etree.HTML(responese.text)
    class_names = tree.xpath('//div[@id="menu"]/div/ul/li/a/text()')
    # NOTE(review): the '@href' tail of this XPath was destroyed by the blog's
    # email obfuscation ('[email protected]'); reconstructed to mirror the
    # text() query above — confirm against the live page.
    class_links = tree.xpath('//div[@id="menu"]/div/ul/li/a/@href')
    for class_name, class_link in zip(class_names, class_links):
        # Directory URLs end with '/', so the last path segment is empty.
        if len(class_link.split('/')[-1]) == 0:
            list_data.append({
                '類別名稱': class_name,
                '類別鏈接': class_link,
            })
    return list_data
2、通過上面獲取的分類地址,抓取每個分類下所有課程的名稱、鏈接和發布時間,並保存到 MongoDB 中;
"""For every category URL, scrape each listing page's course name, link and
publish date into MongoDB; a later stage reads the links back out of the
collection to fetch download URLs and passwords."""
from spiders_diaosi import get_class_data
import requests
from lxml import etree
import pymongo
from multiprocessing import Pool  # unused in the original; kept in case later code relies on it

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']


def get_kecheng_data(url):
    """Scrape one listing page: store each course's name, link and date in MongoDB."""
    try:
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # Parse once and reuse the tree (the original re-parsed the HTML
        # for every XPath query).
        tree = etree.HTML(response.text)
        kecheng_names = tree.xpath('//ul[@class="g-list1"]/li/a/text()')
        # NOTE(review): '@href' tail reconstructed — the blog's email
        # obfuscation replaced it with '[email protected]'.
        kecheng_links = tree.xpath('//ul[@class="g-list1"]/li/a/@href')
        times = tree.xpath('//ul[@class="g-list1"]/li/span/text()')
        for kecheng_name, kecheng_link, time in zip(kecheng_names, kecheng_links, times):
            data = {
                '課程名稱': kecheng_name,
                '課程鏈接': kecheng_link,
                '發布時間': time,
            }
            # insert_one() replaces Collection.insert(), which was removed
            # in PyMongo 4.
            kecheng_message.insert_one(data)
    except Exception as e:
        # Best-effort scraping: log the failure and keep going with the
        # next page (matches the original's behavior).
        print(e)


def get_max_page(url):
    """Return the total number of listing pages for one category."""
    page_response = requests.get(url, headers=headers)
    page_num = int(etree.HTML(page_response.text).xpath(
        '//span[@class="pageinfo"]/strong[1]/text()')[0])
    return page_num


def get_class_id(url):
    """Walk every listing page of one category and scrape it."""
    class_response = requests.get(url, headers=headers)
    class_response.encoding = class_response.apparent_encoding
    # Hoisted: the original called get_max_page(url) twice, costing an
    # extra HTTP request per category.
    max_page = get_max_page(url)
    if max_page != 1:
        # Pagination links look like 'list_<id>_<n>.html'; recover the
        # numeric <id> from the last pagination link.
        class_id = int(etree.HTML(class_response.text).xpath(
            '//ul[@class="pagelist"]/li/a/@href')[-1].split('_')[1])
        for num in range(1, max_page + 1):
            new_url = '{}list_{}_{}.html'.format(url, class_id, num)
            get_kecheng_data(new_url)
    else:
        # Single-page category: scrape the category URL itself.
        get_kecheng_data(url)


# Read the category links scraped earlier and crawl each one in turn.
for link in get_class_data():
    url = link['類別鏈接']
    print('開始爬取:' + link['類別名稱'])
    get_class_id(url)
    print('已經爬完了:' + link['類別名稱'])
3、從數據庫中讀取每個課程的鏈接。因為下載地址只有登入之後才可以看到,所以先模擬登入,再抓取下載地址和密碼,並保存到 MongoDB 中。
"""Log in to the site (download links are member-only), then read every course
link stored in MongoDB, extract its download URL and extraction password, and
save them into the 'dow_message' collection."""
from get_captcha import get_capthca
import pymongo
import re
import requests
from lxml import etree
import random

client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']
dow_message = diaosi['dow_message']

login_url = 'http://www.diaosiweb.net/member/index.php'
# Rotate between a few desktop User-Agents to look less like a bot.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {'User-Agent': random.choice(headers_data)}

# DedeCMS member-login form fields.
data = {
    'fmdo': 'login',
    'dopost': 'login',
    'gourl': '',
    'userid': '***',  # fill in your username here (or read it with input())
    'pwd': '****',    # fill in your password here (or read it with input())
    'vdcode': '',
    'keeptime': '604800',
}

# Save the captcha image locally, then ask the operator to transcribe it.
get_capthca(login_url)
captcha = input('輸入你看到的驗證碼:')
data['vdcode'] = captcha

session = requests.Session()
session.headers.update(headers)
# BUG FIX: the login form must be POSTed. The original used session.get()
# with a data= payload, which never submits the credentials, so every
# subsequent page was fetched unauthenticated.
login_response = session.post(login_url, data=data)

# The download URL sits in a hidden <div id='pan'>; compile the pattern once.
dow_pattern = re.compile("<div id='pan' style=\"display:none;\">(.*?)</div>")

for link in kecheng_message.find():
    html = session.get(link['課程鏈接'])
    html.encoding = html.apparent_encoding
    dow_url = dow_pattern.findall(html.text)[0]
    # Candidate password strings; the last styled <span> usually holds
    # '網盤提取密碼:xxxx' when a password exists.
    mima = etree.HTML(html.text).xpath('//span[@style]/text()')
    # Renamed from 'data' so the login payload above is not clobbered.
    record = {
        'name': link['課程名稱'],
        'link': link['課程鏈接'],
        'dow_url': dow_url,
    }
    try:
        # Parentheses make the original precedence explicit:
        # A or (B and C), since 'and' binds tighter than 'or'.
        if len(mima) == 0 or (len(mima) > 5 and '網盤提取密碼' not in mima[-1].split(':')):
            record['mima'] = '沒有密碼'
        else:
            record['mima'] = mima
        # insert_one() replaces Collection.insert(), removed in PyMongo 4.
        dow_message.insert_one(record)
        print(record)
    except Exception as e:
        # Log the failure plus the course name and continue with the rest.
        print(e)
        print(link['課程名稱'])
下面是獲取登入頁面驗證碼並保存到本地的程式碼:
"""Fetch the login page's captcha image and save it to the working directory.

For now the image is only saved locally for manual transcription; automatic
recognition could be added later.
"""
import requests
from lxml import etree
import os

login_url = 'http://www.diaosiweb.net/member/index.php'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}


def get_capthca(url):
    """Download the captcha shown on `url` and write it to ./captcha.jpg.

    NOTE: the misspelled name ('capthca') is kept because the login script
    imports it under exactly this name.
    """
    login_response = requests.get(url, headers=headers)
    # The <img id="vdimgck"> src is site-relative ('../...'); strip the '..'
    # and prefix the site root to build an absolute URL.
    # NOTE(review): the '@src' tail of this XPath was destroyed by the blog's
    # email obfuscation; reconstructed — confirm against the live page.
    image_url = 'http://www.diaosiweb.net' + etree.HTML(login_response.text).xpath(
        '//img[@id="vdimgck"]/@src')[0].replace('..', '')
    image_response = requests.get(image_url).content
    # 'with' closes the file automatically; the original's explicit
    # f.close() inside the block was redundant.
    with open('captcha.jpg', 'wb') as f:
        f.write(image_response)
    print('驗證碼已經保存到:{}'.format(os.getcwd()))
恩,這樣差不多就完成了一個爬蟲項目了,因為是第一次完整的爬取,所以寫的比較亂,也沒有思維圖,也知道有很多地方不完善,但是發懶筋了,不想寫了,先這樣吧!
Python獲取個人網站的所有課程下載鏈接和密碼,並保存到Mongodb中