A detailed example of basic usage of the Python crawler library Scrapy
Recently a project required me to write a crawler to scrape some question banks. Until now I had always written crawlers in Node or PHP, but I kept hearing that Python is excellent for crawling, so I picked up Python's crawler framework, Scrapy.
Below is a brief introduction to Scrapy's project structure and usage.
First, install the Scrapy framework:
pip install scrapy
Then create a crawler project with the scrapy command:
scrapy startproject questions
An overview of the project files:
scrapy.cfg: the project's configuration file
questions/: the project's Python module; your code goes here
questions/items.py: the project's item definitions
questions/pipelines.py: the project's pipelines
questions/settings.py: the project's settings
questions/spiders/: the directory holding the spider code
questions/spiders/xueersi.py: the main crawler implementation
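Note that startproject does not create the spider file itself. You can write it by hand, or scaffold it with Scrapy's genspider command (the exact template it emits varies by Scrapy version):

scrapy genspider xueersi tiku.xueersi.com

This drops a skeleton xueersi.py into questions/spiders/ with name and allowed_domains already filled in.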
xueersi.py, the spider itself:
# -*- coding: utf-8 -*-
import re
import scrapy
from questions.items import QuestionsItem

class XueersiSpider(scrapy.Spider):
    name = "xueersi"  # spider name
    allowed_domains = ["tiku.xueersi.com"]  # target domain
    # start URLs: one listing page per subject
    start_urls = [
        "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1",
        "http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_1",
        "http://tiku.xueersi.com/shiti/list_1_3_0_0_4_0_1",
    ]
    levels = ['偏易', '中檔', '偏難']
    subjects = ['英語', '語文', '數學']

    # When the crawl starts, Scrapy calls start_requests automatically;
    # if it is not defined, parse is called for every start URL instead.
    # def start_requests(self):
    #     yield scrapy.Request('http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_39', callback=self.getquestion)

    def parse(self, response):
        # XPath selector syntax is documented in the official Scrapy docs
        arr = response.xpath("//ul[@class='pagination']/li/a/text()").extract()
        total_page = arr[3]  # total number of listing pages
        # issue one request per listing page to collect all questions
        for index in range(int(total_page)):
            yield scrapy.Request(response.url.replace('_0_0_4_0_1', "_0_0_4_0_" + str(index)),
                                 callback=self.getquestion)

    # extract every question on a listing page
    def getquestion(self, response):
        for res in response.xpath('//div[@class="main-wrap"]/ul[@class="items"]/li'):
            item = QuestionsItem()  # instantiate the Item class
            # question body
            questions = res.xpath('./div[@class="content-area"]').re(
                r'<div class="content-area">?([\s\S]+?)<(table|\/td|div|br)')
            if len(questions):
                question = questions[0].strip()
                item['source'] = question  # raw HTML of the question
                dr = re.compile(r'<[^>]+>', re.S)
                question = dr.sub('', question)  # strip HTML tags
                content = res.extract()
                item['content'] = question
                # subject, derived from the listing URL
                subject = re.findall(r'http:\/\/tiku\.xueersi\.com\/shiti\/list_1_(\d+)', response.url)
                item['subject'] = self.subjects[int(subject[0]) - 1]
                # difficulty level (relative lookup within this question block)
                levels = res.xpath('./div[@class="info"]').re(r'難度:([\s\S]+?)<')
                item['level'] = self.levels.index(levels[0]) + 1
                # answer options
                options = re.findall(r'[A-D][\..]([\s\S]+?)<(\/td|\/p|br)', content)
                item['options'] = options
                if len(options):
                    url = res.xpath('./div[@class="info"]/a/@href').extract()[0]
                    request = scrapy.Request(url, callback=self.getanswer)
                    request.meta['item'] = item  # stash the item and pass it to the next request
                    yield request

    # extract the answer from the detail page
    def getanswer(self, response):
        res = response.xpath('//div[@class="part"]').re(r'<td>([\s\S]+?)<\/td>')
        # an answer that comes bundled with an analysis
        con = re.findall(r'([\s\S]+?)<br>[\s\S]+?([A-D])', res[0])
        if con:
            answer = con[0][1]
            analysis = con[0][0]  # the analysis text
        else:
            answer = res[0]
            analysis = ''
        if answer:
            item = response.meta['item']  # retrieve the stashed item
            item['answer'] = answer.strip()
            item['analysis'] = analysis.strip()
            item['answer_url'] = response.url
            yield item  # the output pipeline (pipelines.py) receives this item automatically
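Before wiring XPath expressions into the spider, it is convenient to try them against a live page in Scrapy's interactive shell; for example (the output naturally depends on the site's current markup):

scrapy shell "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1"
>>> response.xpath("//ul[@class='pagination']/li/a/text()").extract()

The shell exposes the same response object the spider's callbacks receive, so expressions verified here can be pasted into parse and getquestion unchanged.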
items.py, the data structure definitions:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class QuestionsItem(scrapy.Item):
    content = scrapy.Field()
    subject = scrapy.Field()
    level = scrapy.Field()
    answer = scrapy.Field()
    options = scrapy.Field()
    analysis = scrapy.Field()
    source = scrapy.Field()
    answer_url = scrapy.Field()
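A QuestionsItem behaves like a dict restricted to the declared fields; assigning to an undeclared key raises a KeyError, which catches typos early. A quick interactive sketch:

>>> from questions.items import QuestionsItem
>>> item = QuestionsItem(subject='數學', level=2)
>>> item['answer'] = 'B'
>>> dict(item)
{'subject': '數學', 'level': 2, 'answer': 'B'}
>>> item['score'] = 1  # not a declared Field: raises KeyError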
pipelines.py, the output pipeline (in this example the scraped data is written to a local MySQL database):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import pymysql

class QuestionsPipeline(object):
    def __init__(self):
        # open the database connection
        self.connect = pymysql.connect(host='localhost', user='root', password='',
                                       database='question', use_unicode=True, charset='utf8')
        # get a cursor
        self.cursor = self.connect.cursor()
        print("MySQL connected successfully!")
        self.answer = ['A', 'B', 'C', 'D']

    def process_item(self, item, spider):
        content = item['content']
        # md5 of the question body, used to filter out duplicate questions
        content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
        self.cursor.execute("select id from question where hash=%s", (content_hash,))
        res = self.cursor.fetchone()
        # skip questions that are already stored
        if not res:
            # insert the question (parameterized query, so values are escaped by the driver)
            sqlstr = ("insert into question(content,source,subject,level,answer,analysis,hash,answer_url) "
                      "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)")
            self.cursor.execute(sqlstr, (content, item['source'], item['subject'], item['level'],
                                         item['answer'], item['analysis'], content_hash,
                                         item['answer_url']))
            qid = self.cursor.lastrowid
            # insert the options; in the answer column, 2 marks the correct option and 1 a wrong one
            answer = self.answer.index(item['answer'])
            for index, option in enumerate(item['options']):
                ans = '2' if answer == index else '1'
                # option is a (text, closing-tag) tuple from re.findall, so take option[0]
                self.cursor.execute("insert into options(content,qid,answer) VALUES(%s,%s,%s)",
                                    (option[0], qid, ans))
            self.connect.commit()
        return item
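As the boilerplate comment notes, the pipeline only runs once it is registered in the project settings. A minimal sketch of the relevant setting (the priority value 300 is an arbitrary choice; the question and options tables are assumed to already exist in the local MySQL database):

# questions/settings.py
ITEM_PIPELINES = {
    'questions.pipelines.QuestionsPipeline': 300,
}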
Once the crawler is built, run it from the project root:
scrapy crawl xueersi # scrapy crawl <spider name>
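While debugging, it can also be handy to dump the scraped items straight to a file with Scrapy's built-in feed exports instead of (or in addition to) the MySQL pipeline, e.g.:

scrapy crawl xueersi -o questions.json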