Crawling JD.com face-mask listings with Scrapy and a simulated browser, and saving the data to MySQL, SQLite, and MongoDB
阿新 · Published 2018-12-02
Because JD.com's list pages are rendered dynamically with JavaScript, the spider fetches them through a simulated browser (Selenium driving headless Chrome). The spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request

from jdpro.items import jdproItem

num = 0  # page counter shared across parse() calls


class MaskSpider(scrapy.Spider):
    name = 'mask'
    allowed_domains = ['list.jd.com']

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://list.jd.com/list.html?cat=1316,1381,1392&sort=sort_totalsales15_desc&trans=1&page=85&JL=6_0_0#J_main"]

    def start_requests(self):
        for url_str in self.urls:
            # meta["page"] == "0" tells the Selenium middleware to load the URL directly
            yield Request(url=url_str, callback=self.parse, meta={"page": "0"}, dont_filter=True)

    def parse(self, response):
        li_list = response.css('#plist > ul > li')
        page_next = response.css('#J_bottomPage > span.p-num > a.pn-next')
        for li in li_list:
            # create a fresh item per product; reusing one item object
            # across iterations would yield duplicated references
            item = jdproItem()
            try:
                goods_name = li.xpath('./div/div/a/em/text()')[0].extract().strip("\n\t ")
                if goods_name == "":
                    goods_name = li.xpath('./div/div/a/em/text()')[1].extract().strip("\n\t ")
            except Exception as e:
                print(e)
                goods_name = "N/A"
            try:
                goods_price = li.xpath('.//div[@class="p-price"]/strong/i/text()')[0].extract()
            except Exception as e:
                print(e)
                goods_price = "no price"
            try:
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@src')[0].extract()
            except Exception as e:
                print(e)
                # lazily loaded images keep the URL in data-lazy-img instead of src
                goods_img = "https:" + li.xpath('.//div[contains(@class,"p-img")]/a/img/@data-lazy-img')[0].extract()
            try:
                platfrom = li.xpath('.//div[contains(@class,"p-commit")]/strong/a/text()')[0].extract()
            except Exception:
                platfrom = "N/A"
            try:
                sales = li.xpath('.//div[@class="p-shop"]/span/a/text()')[0].extract().strip(".")
            except Exception as e:
                print(e)
                sales = "N/A"
            item["goods_name"] = goods_name
            item["goods_price"] = goods_price
            item["goods_img"] = goods_img
            item["platfrom"] = platfrom  # field name kept as defined in items.py
            item["sales"] = sales
            yield item

        global num
        if len(page_next) > 0:
            num += 1
            if num < 260:
                print("Crawling page {}".format(num))
                # re-request the same URL; meta["page"] == "2" makes the
                # middleware click the "next page" button in the live browser
                yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)
        else:
            print("Finished crawling")
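The spider imports jdproItem from jdpro.items, a file the post doesn't include. A minimal sketch (not the author's original file) consistent with the five fields assigned above would be:

# jdpro/items.py -- minimal sketch; the original file isn't shown in the post
import scrapy


class jdproItem(scrapy.Item):
    goods_name = scrapy.Field()   # product title
    goods_price = scrapy.Field()  # listed price
    goods_img = scrapy.Field()    # image URL
    platfrom = scrapy.Field()     # spelling kept as used throughout the post
    sales = scrapy.Field()        # shop name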
In settings.py, set a browser User-Agent, ignore robots.txt, register the Selenium downloader middleware, and enable whichever storage pipeline you need (here SQLite):

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    # 'jdpro.middlewares.JdproDownloaderMiddleware': 543,
    'jdpro.middlewares.SeleniumMiddleware': 543,
}

ITEM_PIPELINES = {
    # 'jdpro.pipelines.writeFilesPipeline': 300,
    'jdpro.pipelines.saveSqlitePipeline': 301,
    # 'jdpro.pipelines.saveMysqlPipeline': 302,
    # 'jdpro.pipelines.saveMongodbPipeline': 303,
}
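The middleware below drives one shared browser instance, so it is safer to stop Scrapy from dispatching several requests at once. This extra setting is not in the original post but matches that constraint:

# Assumption (not in the original post): with a single shared Selenium
# browser, concurrent page loads and clicks would interleave, so serialize them.
CONCURRENT_REQUESTS = 1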
The pipelines in pipelines.py cover all four storage targets: a plain text file, SQLite, MySQL, and MongoDB:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import sqlite3

import pymongo
import pymysql


# Write items to a text file, one JSON object per line
class writeFilesPipeline(object):
    def open_spider(self, spider):
        self.fp = open("data.txt", "w", encoding="utf8")

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + "\n")
        return item


# Save items to an SQLite database
class saveSqlitePipeline(object):
    def open_spider(self, spider):
        # connect to the database
        self.conn = sqlite3.connect("Goods.db")

    def close_spider(self, spider):
        # close the database
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        # parameterized query instead of string interpolation
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values(?,?,?,?,?)'
        try:
            self.cursor.execute(sql, (item['goods_name'], item['goods_price'],
                                      item['goods_img'], item['platfrom'], item['sales']))
            self.conn.commit()
        except Exception as e:
            print("insert failed...")
            print(e)
            self.conn.rollback()
        return item


# Save items to a MySQL database
class saveMysqlPipeline(object):
    def open_spider(self, spider):
        # connect to the database (pymysql requires port as an int, not a string)
        self.conn = pymysql.Connect(host="xxxxxx", port=3306, user="root",
                                    password="xxxxxx", database="xxxxxx", charset="utf8")

    def close_spider(self, spider):
        # close the database
        self.conn.close()

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        sql = 'insert into Goods(goods_name,goods_price,goods_img,platfrom,sales) values(%s,%s,%s,%s,%s)'
        try:
            self.cursor.execute(sql, (item['goods_name'], item['goods_price'],
                                      item['goods_img'], item['platfrom'], item['sales']))
            self.conn.commit()
        except Exception as e:
            print("insert failed...")
            print(e)
            self.conn.rollback()
        return item


# Save items to a MongoDB database
class saveMongodbPipeline(object):
    def open_spider(self, spider):
        # connect to the server
        self.client = pymongo.MongoClient(host="localhost", port=27017)

    def close_spider(self, spider):
        # close the connection
        self.client.close()

    def process_item(self, item, spider):
        # select database and collection
        db = self.client.job51
        col = db.job51
        # convert the item to a plain dict before inserting
        dic = dict(item)
        col.insert_one(dic)
        return item
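Both SQL pipelines assume a Goods table already exists; its schema isn't shown in the post. Here is a hypothetical one-off setup script for the SQLite case, with every column as TEXT because all fields are scraped as strings:

# create_table.py -- hypothetical setup script; the real schema isn't in the post
import sqlite3

conn = sqlite3.connect("Goods.db")
conn.execute("""
    CREATE TABLE IF NOT EXISTS Goods (
        goods_name  TEXT,
        goods_price TEXT,
        goods_img   TEXT,
        platfrom    TEXT,  -- spelling kept to match the item field name
        sales       TEXT
    )
""")
conn.commit()
conn.close()

The MySQL table would need equivalent DDL (e.g. VARCHAR columns) created by hand as well.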
The Selenium downloader middleware (middlewares.py) renders every request in headless Chrome; for pagination requests it clicks the "next page" button instead of reloading the URL:

import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def __init__(self):
        self.options = Options()
        self.options.add_argument("--headless")
        self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe",
                                        chrome_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta["page"]) == 2:
            # pagination request: click "next page" in the already-loaded browser
            next_page = self.browser.find_element_by_css_selector('#J_bottomPage > span.p-num > a.pn-next')
            next_page.click()
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(10)  # wait for the lazily loaded goods list to render
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                                encoding="utf8", request=request)
        else:
            try:
                print("url is :::::", request.url)
                self.browser.get(request.url)
                self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            except TimeoutException:  # selenium raises TimeoutException, not TimeoutError
                print("timed out")
            time.sleep(10)
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                                encoding="utf8", request=request)
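One loose end: the middleware never quits the browser, so a headless Chrome process lingers after each run. A sketch of a fix, using Scrapy's standard from_crawler/spider_closed signal pattern (these two methods would be added to the SeleniumMiddleware above; they are not in the original post):

from scrapy import signals


class SeleniumMiddleware(object):
    # __init__ and process_request as shown above ...

    @classmethod
    def from_crawler(cls, crawler):
        # let Scrapy build the middleware and register the cleanup hook
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # shut down headless Chrome when the spider finishes
        self.browser.quit()

With everything in place, the crawl is started as usual with scrapy crawl mask.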