
scrapy + selenium: scraping Taobao product data and storing it in MongoDB


1. Settings

# MongoDB connection settings
MONGO_URI = 'localhost'
MONGO_DB = 'taobao'

# Search keywords
KEYWORDS = ['小米手機', '華為手機']
# Maximum number of result pages to crawl
MAX_PAGE = 2
# Response timeout (seconds)
SELENIUM_TIMEOUT = 20

ROBOTSTXT_OBEY = False  # ignore robots.txt

# Downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'taobaoSpider.middlewares.SeleniumMiddleware': 300,
}


# Item pipelines
ITEM_PIPELINES = {
    # 'taobaoSpider.pipelines.TaobaospiderPipeline': 300,
    'taobaoSpider.pipelines.MongoPipeline': 400,
}
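For reference, any Scrapy component that receives the crawler can read these custom keys through crawler.settings. The sketch below is not part of the project (the class name ExampleComponent is made up for illustration) and just shows the typed accessors Scrapy provides:

# A minimal sketch, not part of the project, of reading the custom settings above
class ExampleComponent(object):
    @classmethod
    def from_crawler(cls, crawler):
        keywords = crawler.settings.getlist('KEYWORDS')      # ['小米手機', '華為手機']
        max_page = crawler.settings.getint('MAX_PAGE')       # 2
        timeout = crawler.settings.getint('SELENIUM_TIMEOUT')
        print(keywords, max_page, timeout)
        return cls()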
2. Item
import scrapy


class TaobaospiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    sales = scrapy.Field()
    shop = scrapy.Field()
    location = scrapy.Field()
    image = scrapy.Field()
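Because the MongoDB pipeline later calls dict(item), a populated item converts directly into the document that gets inserted. A quick illustration with placeholder values (not real scraped data):

# Placeholder values, just to show the item-to-dict conversion used by the pipeline
item = TaobaospiderItem(title='...', price='...', sales='...',
                        shop='...', location='...', image='...')
print(dict(item))  # {'title': '...', 'price': '...', ...}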

3. Spider

# -*- coding: utf-8 -*-
import scrapy

from taobaoSpider.items import TaobaospiderItem


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    base_url = 'https://s.taobao.com/search?q={}'

    def start_requests(self):
        # Read the search keywords and page range from settings via self.settings.get()
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url.format(keyword)
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={'page': page},  # pass the page number to the middleware
                                     dont_filter=True)     # do not deduplicate

    def parse(self, response):
        products = response.xpath(
            '//*[@id="mainsrp-itemlist"]//div[@class="m-itemlist"]'
            '/div[contains(@class, "grid") and contains(@class, "g-clearfix")]'
            '/div[contains(@class, "item")]')
        # products = response.xpath('//div[contains(@class, "item J_MouserOnverReq")]')
        for product in products:
            item = TaobaospiderItem()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
            item['location'] = ''.join(product.xpath('.//div[contains(@class, "location")]//text()').extract()).strip()
            item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
            item['sales'] = ''.join(product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract()).strip()
            item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            yield item
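One detail to watch in start_requests: the keywords contain non-ASCII characters, so it can be safer to percent-encode them before formatting the search URL. This is an improvement assumption rather than part of the original spider:

# Sketch: URL-encode the keyword before building the search URL (not in the original code)
from urllib.parse import quote

base_url = 'https://s.taobao.com/search?q={}'
url = base_url.format(quote('小米手機'))
# -> https://s.taobao.com/search?q=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%A9%9F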

4. Middleware

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from logging import getLogger


class SeleniumMiddleware(object):
    # def __init__(self, timeout=None, service_args=[]):
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # self.browser = webdriver.PhantomJS(service_args=service_args)
        # Headless mode
        # self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        # self.browser = webdriver.Chrome(chrome_options=self.options)
        self.browser = webdriver.Chrome()
        # self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)
        print('timeout:', self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        '''
        Render the request in the browser and return the page source as an HtmlResponse.
        :param request:
        :param spider:
        :return:
        '''
        self.logger.debug('Selenium is Running')
        # page arrives as a plain int from the request meta
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            print(10 * '-', request.url, 10 * '-')
            if page > 1:
                # From the second page on, wait for the pager to finish loading
                # presence_of_element_located: wait until the element exists
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
                # element_to_be_clickable: wait until the element can be clicked
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                input.clear()
                # Type the target page number and submit
                input.send_keys(page)
                submit.click()
            # text_to_be_present_in_element: wait until the active pager item shows the expected page number
            self.wait.until(EC.text_to_be_present_in_element((
                By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
            # presence_of_element_located: make sure the product items have been rendered
            self.wait.until(EC.presence_of_element_located((
                By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    # Class method: read the timeout from settings when Scrapy creates the middleware
    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
                   # service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
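The commented-out headless setup above uses the older chrome_options= keyword; with current Selenium releases the equivalent argument is options=. A minimal sketch, assuming a recent chromedriver is available on the PATH:

# Headless-Chrome variant of the browser setup (a sketch, not the original code path)
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # 'options=' replaces the deprecated 'chrome_options='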

5. Pipeline (store the data in MongoDB)

import pymongo


class TaobaospiderPipeline(object):
    def process_item(self, item, spider):
        return item

# Store scraped items in MongoDB
class MongoPipeline(object):

    # MongoDB connection parameters
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Read the parameters from settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    # Connect to the database when the spider opens
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Close the database connection
    def close_spider(self,spider):
        self.client.close()

    # Insert each item into a collection named after its item class
    def process_item(self, item, spider):
        # item.__class__.__name__ is the item's class name
        name = item.__class__.__name__
        print('---------------name', name, '-------------------')
        self.db[name].insert_one(dict(item))
        return item
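After a crawl (scrapy crawl taobao), the documents end up in a collection named after the item class, so they can be inspected with a few lines of pymongo. A quick check, assuming the MONGO_URI/MONGO_DB values from the settings above:

# Quick sanity check of the stored data (a sketch, using the settings above)
import pymongo

client = pymongo.MongoClient('localhost')
db = client['taobao']
for doc in db['TaobaospiderItem'].find().limit(3):
    print(doc)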
