
Section 17: A Detailed Look at Scrapy's Middleware File


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# ===========================Spider Middleware============================
# Definition: the layer of hooks between the Scrapy engine and the spiders.
# Its job is to process the responses going into the spiders and the
# requests/items coming back out of them.
#
# Scrapy already ships with several ready-to-use spider middlewares, which
# are enabled by default through SPIDER_MIDDLEWARES_BASE:
# {
#     'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
#     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
#     'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
#     'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
#     'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
# }
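
# A hedged illustration (not part of the generated template): a custom spider
# middleware is activated by adding it to SPIDER_MIDDLEWARES in settings.py.
# The 'maoyan' module path below is an assumption based on the class names in
# this file. Lower numbers run closer to the engine, higher numbers closer to
# the spider, and None disables a built-in middleware:
#
# SPIDER_MIDDLEWARES = {
#     'maoyan.middlewares.MaoyanSpiderMiddleware': 543,
#     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,  # disable a default
# }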

# =================The SpiderMiddleware class==================
class MaoyanSpiderMiddleware(object):

    # Class method: through the crawler argument you can read the global
    # settings defined in settings.py, e.g. crawler.settings.get(name).
    @classmethod
    def from_crawler(cls, crawler):
        """
        :param crawler: gives access to the global settings,
                        e.g. crawler.settings.get(name)
        """
        s = cls()
        # Connect the spider_opened handler to the spider_opened signal,
        # which fires when crawling starts. This signal is typically used
        # to allocate the spider's resources.
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        # Connect the spider_closed handler to the spider_closed signal,
        # which is used to release the resources acquired in spider_opened.
        # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s
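
    # A minimal sketch of how from_crawler might read global settings
    # (MAOYAN_USER_AGENTS is a hypothetical setting name, used only to
    # illustrate access via crawler.settings):
    #
    # @classmethod
    # def from_crawler(cls, crawler):
    #     s = cls()
    #     s.user_agents = crawler.settings.getlist('MAOYAN_USER_AGENTS')
    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    #     return s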

    # Called for each response that goes through the spider middleware on
    # its way into the spider. Should return None or raise an exception.
    def process_spider_input(self, response, spider):
        """
        :param response: the response being processed by this middleware
        :param spider: the spider the response is intended for
        """
        return None
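
    # A hedged sketch of a real process_spider_input: reject responses with
    # an empty body by raising an exception, which stops further input
    # processing and is routed to the request's error handling. IgnoreRequest
    # (from scrapy.exceptions) is just one choice of exception here:
    #
    # from scrapy.exceptions import IgnoreRequest
    #
    # def process_spider_input(self, response, spider):
    #     if not response.body:
    #         raise IgnoreRequest('empty response: %s' % response.url)
    #     return None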

    # Called with the results the spider returns after processing a
    # response. Must return an iterable of Request or item objects.
    def process_spider_output(self, response, result, spider):
        """
        :param response: the response that produced this output
        :param result: an iterable of Request or item objects returned by the spider
        :param spider: the spider whose output is being processed
        """
        # Pass every result through unchanged.
        for i in result:
            yield i
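
    # A hedged sketch of a real process_spider_output: drop outgoing requests
    # whose URL matches an unwanted pattern before they reach the scheduler
    # ('ads.' is a placeholder pattern, and scrapy.Request requires
    # "import scrapy" at the top of the file):
    #
    # def process_spider_output(self, response, result, spider):
    #     for i in result:
    #         if isinstance(i, scrapy.Request) and 'ads.' in i.url:
    #             continue  # silently discard the request
    #         yield i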

    # Called when a spider callback or process_spider_input() (of another
    # middleware) raises an exception. Should return None or an iterable
    # of Request or item objects.
    def process_spider_exception(self, response, exception, spider):
        """
        :param response: the response being processed when the exception was raised
        :param exception: the exception that was raised
        :param spider: the spider that raised the exception
        """
        pass

    # Called with the spider's start requests. Works like
    # process_spider_output(), except that there is no associated response
    # and it must return an iterable of requests only (no items).
    def process_start_requests(self, start_requests, spider):
        """
        :param start_requests: an iterable of the spider's start requests
        :param spider: the spider the start requests belong to
        """
        # Pass every start request through unchanged.
        for r in start_requests:
            yield r
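
    # A hedged sketch of a real process_start_requests: tag every start
    # request so later callbacks can recognize it ('is_start' is an arbitrary
    # meta key chosen for this example):
    #
    # def process_start_requests(self, start_requests, spider):
    #     for r in start_requests:
    #         r.meta['is_start'] = True
    #         yield r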


    # Called when the spider is opened: crawling begins and the spider's
    # resources are allocated.
    def spider_opened(self, spider):
        """
        :param spider: the spider that was opened
        """
        spider.logger.info('Spider opened: %s' % spider.name)

    # # Called when the spider is closed: crawling ends and the resources
    # # acquired in spider_opened are released.
    # def spider_closed(self, spider):
    #     """
    #     :param spider: the spider that was closed
    #     """
    #     spider.logger.info('Spider closed: %s' % spider.name)




# ======================Downloader Middleware========================
# Definition: the layer of hooks between the Scrapy engine and the
# downloader. It processes the requests and responses passing between them
# (see the Scrapy architecture diagram).
# Typical uses: rotating the User-Agent, handling redirects, setting
# proxies, retrying failed requests, setting cookies, and so on.
#
# Scrapy already ships with several ready-to-use downloader middlewares,
# which are enabled by default through DOWNLOADER_MIDDLEWARES_BASE:
# {
#     'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
#     'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
#     'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
#     'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
#     'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
#     'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
#     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
#     'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
#     'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
#     'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
#     'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
#     'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
#     'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
# }
# (Module paths and priorities shown as of Scrapy 1.x; the deprecated
# scrapy.contrib.downloadermiddleware.* paths from older versions map to
# these, and exact entries vary slightly across versions.)
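
# A hedged illustration (not part of the generated template): a custom
# downloader middleware is activated through DOWNLOADER_MIDDLEWARES in
# settings.py; again, the 'maoyan' module path is an assumption. When you
# roll your own User-Agent handling, the built-in UserAgentMiddleware is
# commonly disabled with None:
#
# DOWNLOADER_MIDDLEWARES = {
#     'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
# }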


# ===============The DownloaderMiddleware class=================
class MaoyanDownloaderMiddleware(object):

    # Class method: through the crawler argument you can read the global
    # settings defined in settings.py, e.g. crawler.settings.get(name).
    @classmethod
    def from_crawler(cls, crawler):
        """
        :param crawler: gives access to the global settings,
                        e.g. crawler.settings.get(name)
        """
        s = cls()
        # Connect the spider_opened handler to the spider_opened signal,
        # typically used to allocate the spider's resources.
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        # Connect the spider_closed handler to the spider_closed signal,
        # used to release the resources acquired in spider_opened.
        # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    # Called for each request pulled from the scheduler, before it is
    # handed to the downloader. Must either return None (continue normal
    # processing), return a Response object, return a Request object, or
    # raise IgnoreRequest.
    def process_request(self, request, spider):
        """
        :param request: the request pulled from the scheduler
        :param spider: the spider the request originated from
        """
        return None
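
    # A hedged sketch of a real process_request: attach a proxy and a random
    # User-Agent before the request is downloaded. The proxy address and the
    # USER_AGENTS list are placeholders, and "import random" would be needed
    # at the top of the file:
    #
    # def process_request(self, request, spider):
    #     request.headers['User-Agent'] = random.choice(USER_AGENTS)
    #     request.meta['proxy'] = 'http://127.0.0.1:8080'
    #     return None  # continue normal downloading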

    # Called with the response the downloader returns for a request. Must
    # either return a Response object, return a Request object, or raise
    # IgnoreRequest.
    def process_response(self, request, response, spider):
        """
        :param request: the request that originated the response
        :param response: the response returned by the downloader
        :param spider: the spider the response is intended for
        """
        return response
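
    # A hedged sketch of a real process_response: re-schedule requests that
    # came back with an anti-scraping status code instead of passing the bad
    # response on to the spider (the status codes are illustrative):
    #
    # def process_response(self, request, response, spider):
    #     if response.status in (403, 503):
    #         # returning a Request re-schedules it; dont_filter bypasses
    #         # the duplicate filter so it is actually downloaded again
    #         return request.replace(dont_filter=True)
    #     return response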

    # Called when a download handler or process_request() (of another
    # middleware) raises an exception. Must either return None, a Response
    # object, or a Request object.
    def process_exception(self, request, exception, spider):
        """
        :param request: the request that caused the exception
        :param exception: the exception that was raised
        :param spider: the spider the request belongs to
        """
        pass
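
    # A hedged sketch of a real process_exception: retry timed-out requests
    # by returning a fresh copy of the request. The TimeoutError handling is
    # illustrative only; the built-in RetryMiddleware already covers the
    # common cases:
    #
    # from twisted.internet.error import TimeoutError
    #
    # def process_exception(self, request, exception, spider):
    #     if isinstance(exception, TimeoutError):
    #         return request.replace(dont_filter=True)
    #     return None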

    # Called when the spider is opened: crawling begins and the spider's
    # resources are allocated.
    def spider_opened(self, spider):
        """
        :param spider: the spider that was opened
        """
        spider.logger.info('Spider opened: %s' % spider.name)

    # # Called when the spider is closed: crawling ends and the resources
    # # acquired in spider_opened are released.
    # def spider_closed(self, spider):
    #     """
    #     :param spider: the spider that was closed
    #     """
    #     spider.logger.info('Spider closed: %s' % spider.name)
