【爬蟲入門】【同步】爬取人人車車輛資訊1.0
阿新 • 發佈:2019-01-09
# Scrape used-car listings from renrenche.com (synchronous version).
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import re, sqlite3


class RRCSpider(object):
    """
    Synchronous spider for renrenche.com used-car listings.

    Workflow: fetch a listing page, extract the relative detail-page
    links, fetch each detail page and print the parsed record.
    """

    # Patterns are compiled once and reused for every page / detail request.
    # Extracts each car's relative detail URL from a listing page.
    LIST_ITEM_RE = re.compile(
        r'<li class="span6 list-item.*?".*?<a.*?href="(.*?)".*?class="thumbnail"',
        re.S)
    # Extracts (title, price, down-payment, monthly payment, mileage summary,
    # subtitle, license date, displacement summary, transfer count) from a
    # detail page -- presumably; field meanings inferred from page layout,
    # TODO confirm against a live page.
    DETAIL_RE = re.compile(
        r'<h1 class="title-name rrc.*?">(.*?)</h1>.*?<p class="price.*?">(.*?)</p>.*?<p class="money.*?首付(.*?)<.*?月供(.*?)</p>.*?<ul class=".*?box-list-primary-detail">.*?<strong class="car-summary rrc.*?">(.*?)</strong>.*?<p class="small-title rrc.*?">(.*?)</p>.*?<strong.*?id="car-licensed">(.*?)</strong>.*?<p>.*?<strong class="car-summary">(.*?)</strong>.*?<p class="transfer-record">.*?<strong.*?>(.*?)</strong>',
        re.S)

    def __init__(self):
        pass

    def get_list_html(self, page_num):
        """
        Fetch one listing page.

        :param page_num: 1-based page number of the listing index.
        :return: (html, url) on success, (None, None) on any network error.
        """
        list_url = 'https://www.renrenche.com/zz/ershouche/p{}/'.format(page_num)
        try:
            # NOTE(review): no User-Agent header is sent; the site may block
            # default urllib requests -- confirm before relying on this.
            list_html = urlopen(list_url).read().decode()
        # URLError covers DNS/connection failures; the original caught only
        # HTTPError and crashed on them.
        except (HTTPError, URLError) as e:
            print('列表頁異常:url={}, error={}'.format(list_url, e))
            return None, None
        else:
            return list_html, list_url

    def parse_list_html(self, list_html, list_url):
        """
        Extract the detail-page links from a listing page.

        :param list_html: listing page HTML source.
        :param list_url: URL the HTML came from (used only in the error message).
        :return: list of relative detail-page URLs, or None when nothing matched.
        """
        detail_urls = self.LIST_ITEM_RE.findall(list_html)
        if detail_urls:
            return detail_urls
        print('列表頁資料為空:url={}'.format(list_url))
        return None

    def get_detail_html(self, detail_url):
        """
        Fetch one detail page.

        :param detail_url: absolute URL of the detail page.
        :return: (html, url) on success, (None, None) on any network error.
        """
        try:
            detail_html = urlopen(detail_url).read().decode()
        except (HTTPError, URLError) as e:
            print('詳情頁異常:url={}, error={}'.format(detail_url, e))
            return None, None
        else:
            return detail_html, detail_url

    def parse_detail_html(self, detail_html, detail_url):
        """
        Parse one detail page and print the extracted record.

        :param detail_html: detail page HTML source.
        :param detail_url: URL the HTML came from (used only in the error message).
        :return: tuple of the nine captured fields, or None when the page
            layout did not match.
        """
        records = self.DETAIL_RE.findall(detail_html)
        if not records:
            # The original did findall(...)[0], which raised IndexError on any
            # layout change or anti-bot page; report and skip instead.
            print('詳情頁資料為空:url={}'.format(detail_url))
            return None
        data = records[0]
        print(data)
        return data

    def start_spider(self, num):
        """
        Crawl one listing page and every detail page it links to.

        :param num: listing page number to crawl.
        :return: None
        """
        print('正在請求第{}頁'.format(num))
        list_html, list_url = self.get_list_html(num)
        if not list_html:
            return
        detail_urls = self.parse_list_html(list_html, list_url)
        if not detail_urls:
            return
        for detail_url in detail_urls:
            # Listing pages carry relative paths; prepend the site root.
            url = 'https://www.renrenche.com' + detail_url
            detail_html, d_url = self.get_detail_html(url)
            if detail_html:
                self.parse_detail_html(detail_html, d_url)


if __name__ == '__main__':
    obj = RRCSpider()
    # Synchronous: pages are crawled strictly one after another.
    for x in range(1, 2):
        obj.start_spider(x)