1. 程式人生 > >python Scrapy網路爬蟲實戰(存Json檔案以及存到mysql資料庫)

python Scrapy網路爬蟲實戰(存Json檔案以及存到mysql資料庫)

1-Scrapy建立新工程

在開始爬取之前,您必須建立一個新的 Scrapy 專案。 進入您打算儲存程式碼的目錄中【工作目錄】,執行下列命令,如下是我建立的一個爬取豆瓣的工程douban【儲存路徑為:C:\python27\web】: 

命令: scrapy startproject douban

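2-目錄結構如下(scrapy startproject douban 產生的預設結構,實際內容可能因 Scrapy 版本略有不同):

douban/
    scrapy.cfg
    douban/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py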

3--items的編寫

首先,專案中有 items.py,這個檔案主要是用來封裝爬蟲所要爬取的欄位,如爬豆瓣電影,需要爬電影的ID,url,電影名稱等。

# -*- coding:utf-8 -*-
import scrapy
class MovieItem(scrapy.Item):
    """Container for the fields collected for one movie entry.

    Every attribute is declared as a ``scrapy.Field``; the values themselves
    are assigned by whichever spider populates the item.
    """
    # Text taken from the ranking marker of the entry.
    rank = scrapy.Field()
    # Movie title.
    title = scrapy.Field()
    # URL of the movie's page.
    link = scrapy.Field()
    # Rating text of the movie.
    rate = scrapy.Field()
    # Short quote shown with the entry (presumably optional; verify against the page).
    quote = scrapy.Field()

4-spider_movie250.py  的編寫

# -*- coding:utf-8 -*-
import scrapy
from douban.items import MovieItem
class Movie250Spider(scrapy.Spider):
    """Spider that walks the Douban Top 250 movie list page by page.

    For every ``div.item`` entry on a page one ``MovieItem`` is yielded; the
    "next" pagination link, when present, is then followed and parsed with
    the same callback.
    """

    # Name of the spider; this is the name passed to ``scrapy crawl``.
    name = 'doubanmovie'
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://movie.douban.com/top250/"
    ]

    def parse(self, response):
        """Extract the movie entries of one list page and follow pagination.

        Args:
            response: the downloaded list page.

        Yields:
            One ``MovieItem`` per ``div.item`` element, followed by a
            ``scrapy.Request`` for the next page if a "next" link exists.
        """
        for info in response.xpath('//div[@class="item"]'):
            item = MovieItem()
            # ``extract()`` returns a list of strings, so every field holds a
            # (possibly empty) list rather than a single value.
            item['rank'] = info.xpath('div[@class="pic"]/em/text()').extract()
            item['title'] = info.xpath('div[@class="pic"]/a/img/@alt').extract()
            item['link'] = info.xpath('div[@class="pic"]/a/@href').extract()
            item['rate'] = info.xpath('div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()').extract()
            item['quote'] = info.xpath('div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            # Items are streamed to the pipelines one by one; no local
            # accumulation is needed.
            yield item

        # Pagination: the href is relative, so resolve it against the
        # current page URL before requesting it.
        next_page = response.xpath('//span[@class="next"]/a/@href')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, callback=self.parse)

5-編寫pipelines

# -*- coding: utf-8 -*-
import json
import codecs
#以Json的形式儲存
class JsonWithEncodingCnblogsPipeline(object):
    """Item pipeline that writes every item to ``douban.json``, one JSON object per line."""

    def __init__(self):
        # Opened with mode 'w': existing content of the file is truncated.
        self.file = codecs.open('douban.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line and append it to the file.

        Args:
            item: the scraped item (anything ``dict()`` can convert).
            spider: the spider that produced the item (unused).

        Returns:
            The item, unchanged, so later pipelines can process it.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook Scrapy calls on item pipelines when the
        spider finishes; without it the file would never be closed explicitly.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for :meth:`close_spider`."""
        self.close_spider(spider)

#將資料儲存到mysql資料庫
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
class MySQLStorePipeline(object):
    """Item pipeline that inserts every item into the MySQL table ``douban``.

    Inserts run through a Twisted ``adbapi.ConnectionPool``. The target
    table must already exist.
    """

    def __init__(self, host='127.0.0.1', db='資料庫名', user='root', passwd='root'):
        """Create the connection pool.

        Args:
            host: MySQL server host.
            db: name of the database to use.
            user: MySQL user name.
            passwd: MySQL password.
        """
        dbargs = dict(
            host=host,
            db=db,
            user=user,
            passwd=passwd,
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True,
        )
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the project's ``MYSQL_*`` settings.

        Each setting falls back to the constructor's default when it is not
        defined, so projects without these settings behave as before.
        """
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST', '127.0.0.1'),
            db=settings.get('MYSQL_DBNAME', '資料庫名'),
            user=settings.get('MYSQL_USER', 'root'),
            passwd=settings.get('MYSQL_PASSWD', 'root'),
        )

    def process_item(self, item, spider):
        """Schedule the insert of *item* and pass the item on.

        The insert is handed to the connection pool; the item is returned
        immediately without waiting for the database.

        Returns:
            The item, unchanged.
        """
        deferred = self.dbpool.runInteraction(self.insert_into_table, item)
        # Without an errback a failed insert would go unreported.
        deferred.addErrback(self._log_failure, item, spider)
        return item

    def insert_into_table(self, conn, item):
        """Insert one row for *item*; the table has to be created beforehand.

        Args:
            conn: cursor-like object supplied by ``runInteraction``.
            item: mapping whose fields are lists of extracted strings.
        """
        # Identifiers are backtick-quoted because ``rank`` is a reserved word
        # in MySQL 8.0.2 and later.
        # NOTE(review): the column is spelled ``qute`` while the item field is
        # ``quote`` -- confirm against the actual table definition.
        conn.execute(
            'insert into douban(`rank`,`title`,`rate`,`qute`,`link`) values(%s,%s,%s,%s,%s)',
            (
                self._first(item, 'rank'),
                self._first(item, 'title'),
                self._first(item, 'rate'),
                self._first(item, 'quote'),
                self._first(item, 'link'),
            ),
        )

    @staticmethod
    def _first(item, key):
        """Return the first extracted value of *key*, or None when there is none."""
        values = item.get(key)
        return values[0] if values else None

    def _log_failure(self, failure, item, spider):
        """Report a failed insert through the spider's logger."""
        spider.logger.error(
            'MySQL insert failed for %r: %s',
            item.get('link'),
            failure.getErrorMessage(),
        )

6-settings的編寫

# User-Agent header value used for the crawler's requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) XXXXXXX) Chrome/70.0.3538.67 Safari/537.36'

# MySQL connection parameters for the project.
MYSQL_HOST = '127.0.0.1'
# NOTE(review): looks like a placeholder -- replace with the real database name.
MYSQL_DBNAME = '資料庫名'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'

# Enabled item pipelines. Items pass through them from the lowest value to
# the highest, so each pipeline gets a distinct value to make the order
# explicit: JSON export first, then the MySQL insert.
ITEM_PIPELINES = {
    'douban.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    'douban.pipelines.MySQLStorePipeline': 400,
}

7-main 的編寫

from scrapy import cmdline

# Start the ``doubanmovie`` spider the same way the command line
# ``scrapy crawl doubanmovie`` would.
crawl_argv = ["scrapy", "crawl", "doubanmovie"]
cmdline.execute(crawl_argv)