爬蟲02-淘寶資料採集
阿新 • • 發佈:2018-11-01
"""Taobao search scraper: collect listing data for one search keyword.

__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/24'

Drives Chrome through Selenium: opens the Taobao home page, submits a search
for ``iphone X``, extracts the ``g_page_config`` JSON object embedded in the
result page, saves it to ``./data/iphonex.json`` and prints selected fields
of every listing found under ``mods.itemlist.data.auctions``.

NOTE(review): the page structure (CSS selectors, ``g_page_config`` variable,
JSON key path) is taken as-is from the original script; verify it against
the current Taobao site before relying on it.
"""
import json
import os
import re

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Create the WebDriver object (launches a Chrome instance).
browser = webdriver.Chrome()
# Explicit-wait helper: every wait.until() call polls for at most 10 seconds
# and raises TimeoutException if its condition never becomes truthy.
wait = WebDriverWait(browser, 10)

try:
    # Open the Taobao home page.
    browser.get('https://www.taobao.com/')

    # Wait until the search input box is present in the DOM.
    tb_input = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
    )
    # Wait until the search button can be clicked.
    search_btn = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
        )
    )

    # Type the search keyword into the input box and submit the search.
    tb_input.send_keys('iphone X')
    search_btn.click()

    # The result page embeds its data as "g_page_config = {...};".
    pat = re.compile(r'g_page_config = (.*?}});')

    # click() does not guarantee that the result page has finished loading,
    # so poll the page source until the embedded config shows up instead of
    # reading it once right away. until() returns the (truthy) match object,
    # or raises TimeoutException, which is reported below.
    matchObj = wait.until(lambda driver: pat.search(driver.page_source))

    conf = json.loads(matchObj.group(1))
    print(conf)
    print(type(conf))

    # Make sure the output directory exists; open() does not create it.
    os.makedirs("./data", exist_ok=True)
    with open("./data/iphonex.json", "w", encoding='utf-8') as f:
        # indent=4 pretty-prints the saved dictionary.
        json.dump(conf, f, indent=4)

    # Print the fields of interest for every listing on the result page.
    itemlist = conf["mods"]["itemlist"]["data"]["auctions"]
    for item in itemlist:
        print("店鋪:", item['nick'])
        print("item_loc:", item['item_loc'])
        print("pic_url:", item['pic_url'])
        print("評論數:", item['comment_count'])
        print("詳情頁面:", item['detail_url'])
        print("標題:", item['raw_title'])
        print("標題:", item['title'])
        print("價格:", item['view_price'])
        print('=' * 80)
except TimeoutException as e:
    # An element or the page config did not appear within the wait limit.
    print(e)
finally:
    # Always shut the browser down, including when an unexpected error
    # (KeyError, JSON decoding error, I/O error, ...) propagates. quit() ends
    # the whole WebDriver session, whereas close() only closes the current
    # window and can leave the driver process running.
    browser.quit()