# 爬蟲爬取鏈家二手房資訊,對二手房做分析
# (Scrape Lianjia second-hand housing listings in Wuhan for analysis.)
# Author: 阿新 — published 2019-02-15
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
def generate_web_link(districts):
    """Build the result-page URLs for every Wuhan district on wh.lianjia.com.

    Parameters
    ----------
    districts : iterable of str
        District slugs (e.g. 'jiangan') used as URL path segments.

    Returns
    -------
    list of tuple(str, str)
        (district_slug, page_url) pairs, one per result page of each district.
    """
    page_urls = []
    base_url = 'https://wh.lianjia.com/ershoufang/{}'
    for district in districts:
        district_url = base_url.format(district)
        res = requests.get(district_url).content.decode('utf-8')
        soup = BeautifulSoup(res, 'lxml')
        # The pager div carries a JSON string attribute such as
        # '{"totalPage":100,"curPage":1}'. Parse it with json.loads rather
        # than eval(): eval on remotely-fetched text can execute arbitrary
        # code if the site (or a man-in-the-middle) serves something hostile.
        page_data = soup.find('div', {'class': 'page-box house-lst-page-box'})['page-data']
        totalpage = int(json.loads(page_data)['totalPage'])
        for page in range(1, totalpage + 1):
            page_urls.append((district, district_url + '/pg{}'.format(page)))
    return page_urls
def house_info_spider(page_links):
    """Scrape listing details from every result page and return a DataFrame.

    Parameters
    ----------
    page_links : iterable of tuple(str, str)
        (district_slug, page_url) pairs as produced by generate_web_link().

    Returns
    -------
    pandas.DataFrame
        Columns: 資訊 (listing info), 樓層 (floor), 售價 (total price),
        單價 (unit price), 地區 (district name in Chinese). Empty input
        yields an empty DataFrame with the same columns.
    """
    # Map URL slugs to the human-readable (Chinese) district names.
    district_dicts = {'jiangan': '江岸', 'jianghan': '江漢', 'qiaokou': '礄口',
                      'dongxihu': '東西湖', 'wuchang': '武昌', 'qingshan': '青山',
                      'hongshan': '洪山', 'hanyang': '漢陽', 'donghugaoxin': '東湖高新',
                      'jiangxia': '江夏'}
    rows = []
    for district, url in page_links:
        res = requests.get(url).content.decode('utf-8')
        soup = BeautifulSoup(res, 'lxml')
        house_infos = [i.text for i in soup.find_all('div', {'class': 'houseInfo'})]
        floors = [i.text for i in soup.find_all('div', {'class': 'positionInfo'})]
        total_prices = [i.text for i in soup.find_all('div', {'class': 'totalPrice'})]
        unit_prices = [i.text for i in soup.find_all('div', {'class': 'unitPrice'})]
        district_name = district_dicts[district]
        # zip truncates to the shortest list, matching the original pairing.
        for info, floor, total, unit in zip(house_infos, floors, total_prices, unit_prices):
            rows.append([info, floor, total, unit, district_name])
    # DataFrame.append was deprecated and removed in pandas 2.0, and calling
    # it per-row copies the whole frame each time (O(n^2)). Accumulate plain
    # lists and build the DataFrame once at the end instead.
    return pd.DataFrame(rows, columns=['資訊', '樓層', '售價', '單價', '地區'])
# Entry point: collect listings for every target district and save to CSV.
if __name__ == '__main__':
    target_districts = ['jiangan', 'jianghan', 'qiaokou', 'dongxihu',
                        'wuchang', 'qingshan', 'hongshan', 'hanyang',
                        'donghugaoxin', 'jiangxia']
    links = generate_web_link(target_districts)
    listings = house_info_spider(links)
    listings = listings.reset_index(drop=True)
    # Persist the scraped data locally for later analysis.
    listings.to_csv('lianjia_house.csv', index=False)