1. 程式人生 > >爬蟲02-淘寶資料採集

爬蟲02-淘寶資料採集

"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/24'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
              ┏┓      ┏┓
            ┏┛┻━━━┛┻┓
            ┃      ☃      ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻      ┃
            ┗━┓      ┏━┛
                ┃      ┗━━━┓
                ┃  神獸保佑    ┣┓
                ┃ 永無BUG!   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""

import json
import os
import re

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# Search Taobao for a keyword with a Selenium-driven Chrome, extract the
# ``g_page_config`` JSON blob embedded in the results page, save it to
# ./data/iphonex.json and print the main fields of every listed item.

# Create the WebDriver object (starts a Chrome session).
browser = webdriver.Chrome()
# Explicit-wait helper: every wait.until() below gives up after 10 seconds
# and raises TimeoutException.
wait = WebDriverWait(browser, 10)
try:
    browser.get('https://www.taobao.com/')  # open the Taobao home page
    tb_input = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
    )  # wait until the search box is present
    search_btn = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
    )  # wait until the search button is clickable
    tb_input.send_keys('iphone X')  # type the search keyword
    search_btn.click()  # submit the search

    pat = re.compile(r'g_page_config = (.*?}});')
    # Poll the page source until the embedded config shows up instead of
    # reading it once right after the click. If it never appears, until()
    # raises TimeoutException (handled below) rather than leaving ``conf``
    # undefined and crashing with NameError further down.
    match_obj = wait.until(
        lambda drv: pat.search(drv.page_source),
        message='g_page_config not found in the search results page',
    )
    conf = json.loads(match_obj.group(1))
    print(conf)
    print(type(conf))

    # Make sure the output directory exists; open(..., "w") does not create it.
    os.makedirs("./data", exist_ok=True)
    with open("./data/iphonex.json", "w", encoding='utf-8') as f:
        # indent=4 pretty-prints the dict; ensure_ascii=False keeps non-ASCII
        # text readable in the UTF-8 file instead of \uXXXX escapes.
        json.dump(conf, f, indent=4, ensure_ascii=False)

    itemlist = conf["mods"]["itemlist"]["data"]["auctions"]
    for item in itemlist:
        print("店鋪:", item['nick'])
        print("item_loc:", item['item_loc'])
        print("pic_url:", item['pic_url'])
        print("評論數:", item['comment_count'])
        print("詳情頁面:", item['detail_url'])
        print("標題:", item['raw_title'])
        print("標題:", item['title'])
        print("價格:", item['view_price'])
        print('='*80)
except TimeoutException as e:
    print(e)
finally:
    # Always end the session, even on unexpected errors; quit() closes every
    # window and shuts the driver down, whereas close() only closes the
    # current window.
    browser.quit()