
Scraping Lianjia second-hand housing listings with a crawler and analyzing the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
def generate_web_link(districts):
    '''Generate the listing-page URLs for every covered district of Wuhan.'''
    page_urls = []
    base_url = 'https://wh.lianjia.com/ershoufang/{}'
    for district in districts:
        district_url = base_url.format(district)
        res = requests.get(district_url).content.decode('utf-8')
        soup = BeautifulSoup(res, 'lxml')
        # find() returns the pager div; its 'page-data' attribute is a dict in
        # string form, e.g. '{"totalPage":100,"curPage":1}', and eval() turns it
        # back into a dict so we can read how many pages the district has
        totalpage = int(eval(soup.find('div', {'class': 'page-box house-lst-page-box'})['page-data'])['totalPage'])
        for page in range(1, totalpage + 1):
            page_url = district_url + '/pg{}'.format(page)
            page_urls.append((district, page_url))
    return page_urls
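Since the 'page-data' attribute is in fact plain JSON, json.loads is a safer drop-in for the eval() call above. A minimal sketch, assuming the attribute always contains valid JSON:

import json

page_box = soup.find('div', {'class': 'page-box house-lst-page-box'})
# '{"totalPage":100,"curPage":1}' -> {'totalPage': 100, 'curPage': 1}
totalpage = int(json.loads(page_box['page-data'])['totalPage'])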
def house_info_spider(page_links):
    district_dicts = {'jiangan': '江岸', 'jianghan': '江漢', 'qiaokou': '礄口',
                      'dongxihu': '東西湖', 'wuchang': '武昌', 'qingshan': '青山',
                      'hongshan': '洪山', 'hanyang': '漢陽',
                      'donghugaoxin': '東湖高新', 'jiangxia': '江夏'}
    rows = []
    for district, page_url in page_links:
        res = requests.get(page_url).content.decode('utf-8')
        soup = BeautifulSoup(res, 'lxml')
        house_infos = [i.text for i in soup.find_all('div', {'class': 'houseInfo'})]
        floors = [i.text for i in soup.find_all('div', {'class': 'positionInfo'})]
        total_prices = [i.text for i in soup.find_all('div', {'class': 'totalPrice'})]
        unit_prices = [i.text for i in soup.find_all('div', {'class': 'unitPrice'})]
        house_districts = [district_dicts[district]] * len(house_infos)
        # accumulate plain tuples and build the DataFrame once at the end,
        # which also avoids DataFrame.append (removed in pandas 2.0)
        rows.extend(zip(house_infos, floors, total_prices, unit_prices, house_districts))
    infos = pd.DataFrame(rows, columns=['資訊', '樓層', '售價', '單價', '地區'])
    return infos
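Lianjia, like most listing sites, may throttle or block clients that send no browser headers and fire requests back-to-back. Below is a hedged variant of the page fetch used above; the fetch helper, the User-Agent string, and the one-second delay are illustrative choices, not part of the original script:

import time

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def fetch(url):
    # identify as a regular browser and pause between pages to stay polite
    res = requests.get(url, headers=HEADERS, timeout=10)
    time.sleep(1)
    return res.content.decode('utf-8')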
# Start collecting the data
if __name__ == '__main__':
    districts = ['jiangan','jianghan','qiaokou','dongxihu','wuchang','qingshan',
                 'hongshan','hanyang','donghugaoxin','jiangxia']
    page_links = generate_web_link(districts)
    house_datas = house_info_spider(page_links)
    house_datas = house_datas.reset_index(drop=True)
    house_datas.to_csv('lianjia_house.csv', index=False)  # save the data to disk
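The title promises some analysis, and numpy/matplotlib are imported above but never used. One possible next step is sketched below, assuming 售價 strings look like '200萬' and 單價 strings contain a figure in 元/平米 (both assumptions about Lianjia's page text at scraping time):

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('lianjia_house.csv')
# pull the numeric part out of the price strings
df['售價'] = df['售價'].str.extract(r'([\d.]+)', expand=False).astype(float)  # 萬元
df['單價'] = (df['單價'].str.extract(r'([\d,]+)', expand=False)
              .str.replace(',', '').astype(float))  # 元/平米

# average unit price per district, plotted as a bar chart
avg_price = df.groupby('地區')['單價'].mean().sort_values()
avg_price.plot(kind='bar')
plt.ylabel('average unit price (yuan / sq. m)')
plt.tight_layout()
plt.show()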