
Scraping Data at Scale (1)

This post uses the 58.com (58同城) classifieds site as an example.
The overall workflow is roughly:
1. Collect all the category links from the 58.com category index page
2. Set up the database (MongoDB is used here)
3. Write two spiders: one crawls every item link under a category, the other parses each item's detail page, and both write their results to the database

First, fetch all the category links:
# channel_extract.py

from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'  # the category index page
url_host = 'http://bj.58.com'

def getIndexURL(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

# Temporarily store the collected links in the variable channel_list
# (the output of getIndexURL(start_url), copied here by hand)
channel_list = '''
    http://bj.58.com/shouji/ http://bj.58.com/tongxunyw/ http://bj.58.com/danche/ http://bj.58.com/diandongche/
    http://bj.58.com/fzixingche/ http://bj.58.com/sanlunche/ http://bj.58.com/peijianzhuangbei/ http://bj.58.com/diannao/
    http://bj.58.com/bijiben/ http://bj.58.com/pbdn/ http://bj.58.com/diannaopeijian/ http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/ http://bj.58.com/shumaxiangji/ http://bj.58.com/mpsanmpsi/ http://bj.58.com/youxiji/
    http://bj.58.com/ershoukongtiao/ http://bj.58.com/dianshiji/ http://bj.58.com/xiyiji/ http://bj.58.com/bingxiang/
    http://bj.58.com/jiadian/ http://bj.58.com/binggui/ http://bj.58.com/chuang/ http://bj.58.com/ershoujiaju/
    http://bj.58.com/yingyou/ http://bj.58.com/yingeryongpin/ http://bj.58.com/muyingweiyang/ http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/ http://bj.58.com/fushi/ http://bj.58.com/nanzhuang/ http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/ http://bj.58.com/meirong/ http://bj.58.com/yishu/ http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/ http://bj.58.com/yuqi/ http://bj.58.com/tushu/ http://bj.58.com/tushubook/
    http://bj.58.com/wenti/ http://bj.58.com/yundongfushi/ http://bj.58.com/jianshenqixie/ http://bj.58.com/huju/
    http://bj.58.com/qiulei/ http://bj.58.com/yueqi/ http://bj.58.com/kaquan/ http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/ http://bj.58.com/bangongjiaju/ http://bj.58.com/ershoushebei/ http://bj.58.com/chengren/
    http://bj.58.com/nvyongpin/ http://bj.58.com/qinglvqingqu/ http://bj.58.com/qingquneiyi/ http://bj.58.com/chengren/
    http://bj.58.com/xiaoyuan/ http://bj.58.com/ershouqiugou/ http://bj.58.com/tiaozao/ http://bj.58.com/tiaozao/
    http://bj.58.com/tiaozao/
'''
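
channel_list is just a whitespace-separated string, so before the spiders can use it, it has to be split into individual URLs. A minimal sketch of that step (the variable name channels is my own):

channels = channel_list.split()
print(len(channels))   # about 60 category links, duplicates included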

# Parse the item data and store it in the database
# page_parsing.py
from bs4 import BeautifulSoup
import requests
import pymongo
import time

# Connect to the local MongoDB instance
client = pymongo.MongoClient('localhost', 27017)
# Name the database
ceshi = client['ceshi']
# Create two collections: one for item links, one for item details
url_list = ceshi['url_list']
item_info = ceshi['item_info']
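
As a quick, optional sanity check that the connection and collections work, you can count and read back documents. A small sketch of my own (count_documents assumes pymongo 3.7 or newer):

print(url_list.count_documents({}))   # 0 on a fresh database
print(url_list.find_one())            # None until spider 1 has run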

# spider 1: collect every item link from one listing page of a category
def getLinkFrom(channel, pages, who_sell=0):
    '''Arguments: category link, page number, seller type (0 = individual, the default; 1 = merchant)'''
    # Build the full listing-page URL, e.g. http://bj.58.com/shouji/0/pn2
    full_link = '{}{}/pn{}'.format(channel, str(who_sell), str(pages))
    wb_data = requests.get(full_link)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    # Only parse pages that actually contain listings
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
    else:
        pass
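
getLinkFrom only handles a single listing page, so to actually fill url_list you still need a driver that walks every category and page. A minimal sketch of one (my own addition, not from the original post; the 1–100 page range is an assumption, and empty pages are simply skipped by the td.t check inside getLinkFrom):

def get_all_links_from(channel):
    for page in range(1, 101):
        getLinkFrom(channel, page)
        time.sleep(1)   # be gentle with the site

# assuming channel_list is imported from channel_extract.py:
# for channel in channel_list.split():
#     get_all_links_from(channel)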

# spider 2: parse one item detail page and store its fields
def getItemInfo(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    # Deleted listings redirect through a script whose src path contains '404'
    no_longer_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exist:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price')[0].text
        post_data = soup.select('div.detail-title__info__text')[0].text
        area = soup.select('div.su_con > a')[0].text
        item_info.insert_one({'title': title, 'price': price, 'post_data': post_data, 'area': area, 'url': url})

# Quick test of spider 2 on a single item page
getItemInfo('http://bj.58.com/shouji/34326399783213x.shtml')
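
To populate item_info in bulk, the links stored by spider 1 would be read back and fed to spider 2 one at a time. A minimal sketch of my own (the one-second pause is just a politeness assumption):

for record in url_list.find():
    getItemInfo(record['url'])
    time.sleep(1)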

That's it for this part.