
Scrapy Spider MiddleWare Settings

# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from newrecord.settings import NOW_Y, NOW_M, NOW_D, YES_Y, YES_M, YES_D
from scrapy import signals
import time
import base64

# DownloadMiddleware
# 0 47 167: royal blue in RGB

# Adding a proxy in process_request of the downloader middleware:
# proxy_user_pass = 'USERNAME:PASSWORD'
# encoded_user_pass = base64.b64encode(proxy_user_pass)
# request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
# request.meta['proxy'] = 'IP:PORT'


class NewrecordSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # The response entering the spider. Similar to process_response in the
    # downloader middleware: the response goes through the downloader
    # middleware's process_response first and is handled here afterwards.
    def process_spider_input(self, response, spider):
        # Handles the response entering the spider; returns None.
        print('-----------------------3--------------------')
        # Filtering the response here cannot stop it from reaching the spider;
        # wrap risky checks in try/except Exception instead.
        print('---Entering spider middleware----process_spider_input------response.url----%s--------' % (response.url))
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        try:
            return None
        except Exception as e:
            print(e)

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        # result: the output of the spider callback (e.g. parse_item). Item data
        # can also be handled here; it passes through process_spider_output
        # before being sent on to the item pipelines.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # Handles only the start requests (start_urls); later URLs are unaffected.
    # The method must be named process_start_requests, otherwise it is not called.
    def process_start_requests(self, start_urls, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_urls:
            if str(r).find('rank_news') >= 0:
                print('---------------------0-----------------------------')
                print('-------------------Start request URL entering the Spider MiddleWare-----------start_requests===:%s' % r)
                yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class NewrecordDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Proxy-Authorization: base64-encoded proxy credentials
    # request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"
    # encoded_user_pass = base64.b64encode(proxy_user_pass)
    # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    # request.meta['proxy'] = 'http://127.0.0.1:8000'
    # request.meta['item'] = ''      data added to request.meta can be used to pass parameters between callbacks
    # Request(url, meta={'item': item}, callback=...)
    # request.cookies[...] = ...     add cookies to the request
    def process_request(self, request, spider):
        print('---------------1------------------')
        print('----------------URL of the request entering the DownloadMiddleWare: %s----------------' % (request.url))
        return None
        # return None: continue processing this request
        # return a Response or Request object: the remaining process_request()
        # methods in the chain are skipped

    def process_response(self, request, response, spider):
        # Handles the response of every crawled page; response.url can be used
        # to filter the URLs you need, although Rules are more convenient for that.
        print('-----------------------------2---------------------------------')
        print('----------------URL of the response entering the DownloadMiddleWare: %s----------------' % (response.url))
        return response
        # The returned response goes on to process_spider_input in the spider middleware.

    def process_exception(self, request, exception, spider):
        # return None: continue processing this exception
        # return a Response object: stops process_exception() chain
        # return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
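The file above only defines the two middleware classes; Scrapy ignores them until they are enabled in the project's settings.py. A minimal sketch, assuming the project module is called newrecord and the file lives at newrecord/middlewares.py (the priority value 543 is just the Scrapy template default, adjust as needed):

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'newrecord.middlewares.NewrecordSpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'newrecord.middlewares.NewrecordDownloaderMiddleware': 543,
}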
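The commented-out proxy snippets are Python 2 style: in Python 3, base64.b64encode() expects bytes and returns bytes, so the credentials have to be encoded first and the result decoded before it is placed in the header. A minimal Python 3 sketch of an authenticated proxy in process_request; the class name, proxy address and USERNAME:PASSWORD are placeholders, not part of the original project:

import base64

class ProxyDownloaderMiddleware(object):
    proxy = 'http://YOUR_PROXY_IP:PORT'      # placeholder proxy address
    proxy_user_pass = 'USERNAME:PASSWORD'    # placeholder credentials

    def process_request(self, request, spider):
        # Route the request through the proxy.
        request.meta['proxy'] = self.proxy
        # b64encode() needs bytes in Python 3; decode back to str for the header value.
        encoded_user_pass = base64.b64encode(self.proxy_user_pass.encode('utf-8')).decode('ascii')
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        return None  # continue processing the request normally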
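The request.meta comments also hint at passing data between callbacks. A short illustrative sketch of that pattern; the spider name, URLs and field names are made up for the example and are not tied to the middleware above:

import scrapy

class RankNewsSpider(scrapy.Spider):
    name = 'rank_news'
    start_urls = ['http://example.com/rank_news']   # placeholder start URL

    def parse(self, response):
        item = {'list_url': response.url}
        # Attach the partially filled item to the follow-up request via meta.
        yield scrapy.Request(response.urljoin('detail.html'),
                             meta={'item': item},
                             callback=self.parse_detail)

    def parse_detail(self, response):
        item = response.meta['item']      # retrieve the item attached in parse()
        item['detail_url'] = response.url
        yield item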