程式人生 > scrapy 爬蟲,ip代理,useragent,連線mysql的一些配置

scrapy 爬蟲,ip代理,useragent,連線mysql的一些配置

爬蟲Scrapy 資料庫的配置mysqlpymysql

#進入pipelines.py檔案

#首先匯入pymysql

import pymysql

class SqkPipeline(object):
    """Scrapy item pipeline that persists scraped book items into MySQL.

    Fixes over the pasted original: ``pymsql`` typo, curly "smart" quotes
    (SyntaxError), an unterminated SQL string literal, methods dedented out
    of the class body, and a leaked cursor/connection.
    """

    def __init__(self):
        # One connection per pipeline instance. Credentials are hard-coded
        # for this demo — in a real project move them into settings.py.
        self.client = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            db='xyts',
            charset='utf8',
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        """Insert one scraped item into ``birif_sf`` and return it unchanged."""
        # Parameterized query: pymysql escapes the values server-side,
        # so item text cannot inject SQL.
        sql = ("insert into birif_sf(image,title,author,classify,intro) "
               "VALUES (%s,%s,%s,%s,%s)")
        args = (item['image_url'], item['title'], item['author'],
                item['classify'], item['intro'])
        self.cur.execute(sql, args)
        self.client.commit()
        return item

    def close_spider(self, spider):
        # Scrapy calls this when the spider finishes; release the cursor
        # and connection (the original never closed them).
        self.cur.close()
        self.client.close()

#下圖註釋:

 

ip代理 配置

建立一個.py檔案  proxymiddlewares.py

import random,base64

class ProxyMiddleware(object):
    """Downloader middleware that routes each request through a random HTTP proxy.

    Fixes over the pasted original: ``Class`` keyword capitalization
    (SyntaxError), curly quotes in the print call and meta key, and
    ``process_request`` dedented out of the class body.
    """

    # Pool of "host:port" proxies; one is chosen uniformly per request.
    # NOTE(review): these are public proxies from the original post and are
    # almost certainly dead — replace with your own pool.
    proxy_list = [
        '117.43.1.128:808',
        '122.114.31.177:808',
        '61.135.217.7:80',
        '122.72.18.35:80',
        '122.72.18.34:80',
        '123.139.56.238:9999',
        '110.52.8.14:53281',
        '139.224.80.139:3128',
    ]

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to the outgoing request's meta."""
        pro_dir = random.choice(self.proxy_list)
        print("USE PROXY ->" + pro_dir)
        # Scrapy's HttpProxyMiddleware reads request.meta['proxy'].
        request.meta['proxy'] = 'http://' + pro_dir

繼續建立一個.py檔案useragent.py

#進入useragent.py檔案

#匯入所需模組

import logging
import random

from scrapy import log
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class UserAgent(UserAgentMiddleware):
    """Downloader middleware that sets a random desktop User-Agent per request.

    Fixes over the pasted original: ``longgin.DEBUG`` typo (NameError; the
    intended module is ``logging``), curly quotes (SyntaxError), both methods
    dedented out of the class body, and a required ``__init__`` argument that
    Scrapy does not supply (startup crash) — it now has a default.
    """

    # Pool of real desktop browser UA strings; one is chosen per request.
    # Adjacent string literals are concatenated by the parser.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        # Default value lets Scrapy instantiate the middleware without
        # arguments; the stored value is unused because process_request
        # always draws from user_agent_list.
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Pick a random UA string and set it on the outgoing request."""
        ua = random.choice(self.user_agent_list)
        if ua:
            # scrapy.log is deprecated in modern Scrapy — kept here to
            # match the rest of this file; prefer logging.getLogger().
            log.msg('Current UserAgent:' + ua, level=logging.DEBUG)
            request.headers.setdefault('user-agent', ua)

建立好以上兩個檔案以後,修改 settings.py

# Override the default request headers (the target site ip84.com checks
# Host/Referer).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Host': 'ip84.com',
    'Referer': 'http://ip84.com/',
    'X-XHR-Referer': 'http://ip84.com/',
}

# Enable the two custom middlewares and disable Scrapy's built-in
# UserAgentMiddleware. NOTE: the module path is "downloadermiddlewares"
# (plural) — the original misspelling meant the builtin was never disabled.
DOWNLOADER_MIDDLEWARES = {
    'sqk_xs.useragent.UserAgent': 1,
    'sqk_xs.proxymiddlewares.ProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Register the item pipeline. The dotted path must match the class actually
# defined in pipelines.py (SqkPipeline — the original said SqkXsPipeline).
ITEM_PIPELINES = {'sqk_xs.pipelines.SqkPipeline': 300}

#啟動scrapy的專案:

#建立一個.py檔案start.py

#進入start.py檔案

#匯入模組

from scrapy.cmdline import execute

if __name__ == '__main__':
    # Equivalent to running "scrapy crawl qidian" from the shell; curly
    # quotes and the missing indent under the guard are fixed here.
    execute('scrapy crawl qidian'.split())