1. 程式人生 > Python 學習 - 爬蟲入門練習:爬取鏈家網二手房資訊

Python 學習 - 爬蟲入門練習:爬取鏈家網二手房資訊

import requests
from bs4 import BeautifulSoup
import sqlite3

# Browser-style request header sent with every page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/64.0.3282.140 Safari/537.36',
}

# Listing pages are numbered; %s is replaced by the page number.
PAGE_URL = "https://cs.lianjia.com/ershoufang/pg%s/"

# Named placeholders let sqlite3 do the quoting, so a title or address that
# contains a quote character can no longer break (or inject into) the SQL.
INSERT_SQL = (
    "insert into rsf(nid,dz,fx,dx,cx,zx,dt,jg,title,url) "
    "values(:nid,:dz,:fx,:dx,:cx,:zx,:dt,:jg,:title,:url)"
)


def split_house_info(fields):
    """Map the '|'-separated houseInfo fields to (dz, fx, dx, cx, zx, dt).

    Six fields are used as they are; five fields get the string 'None' as
    the sixth value.  Any other count yields six empty strings, and the raw
    fields are printed so the unexpected layout can be inspected.
    """
    if len(fields) == 6:
        return tuple(fields)
    if len(fields) == 5:
        return tuple(fields) + ('None',)
    print("unexpected houseInfo layout:", fields)
    return ('',) * 6


def parse_listing(item):
    """Build the row dict for one listing element.

    Returns None when one of the expected tags or the data-housecode
    attribute is missing, so the caller can skip the item instead of
    crashing on a None lookup.
    """
    code_tag = item.find(class_="noresultRecommend")
    link_tag = item.find(class_="noresultRecommend img ")
    title_tag = item.find(class_="title")
    info_tag = item.find(class_="houseInfo")
    price_tag = item.find(class_="totalPrice")
    if any(tag is None for tag in (code_tag, link_tag, title_tag, info_tag, price_tag)):
        return None

    house_code = code_tag.get('data-housecode')
    if house_code is None:
        return None

    dz, fx, dx, cx, zx, dt = split_house_info(info_tag.get_text().split("|"))
    return {
        'nid': int(house_code),
        'title': title_tag.get_text(),
        'dz': dz,
        'fx': fx,
        'dx': dx,
        'cx': cx,
        'zx': zx,
        'dt': dt,
        'jg': price_tag.get_text(),
        'url': link_tag.get('href'),
    }


def save_listing(cursor, listing):
    """Insert one listing dict (as built by parse_listing) into table rsf."""
    cursor.execute(INSERT_SQL, listing)


def crawl_ershoufang(db_path="test.db", first_page=1, last_page=100):
    """Fetch the listing pages first_page..last_page and store every listing.

    Each page is downloaded, the items inside the 'sellListContent' element
    are parsed, and one row per item is inserted into table ``rsf`` of the
    SQLite database at db_path.  The table is not created here; it must
    already exist with the columns named in INSERT_SQL.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # One session for the whole crawl instead of a new one per page.
        with requests.session() as session:
            for page in range(first_page, last_page + 1):
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept from the original, drop it if the site's certificate
                # validates in your environment.
                response = session.get(
                    PAGE_URL % page,
                    headers=HEADERS,
                    verify=False,
                    timeout=30,  # requests waits forever without a timeout
                )
                soup = BeautifulSoup(response.text, 'lxml')
                container = soup.find(class_='sellListContent')
                if container is None:
                    print("page %s: no sellListContent element, skipped" % page)
                    continue
                for item in container.find_all(class_='clear LOGCLICKDATA'):
                    listing = parse_listing(item)
                    if listing is None:
                        continue
                    save_listing(cursor, listing)
                # One commit per page keeps finished pages if a later one fails.
                conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    crawl_ershoufang()

 

sqlite3 讀取資料

import sqlite3
def parse_total_price(text):
    """Return the number at the start of a stored price value as a float.

    Anything after the leading digits/decimal point (such as a unit suffix)
    is ignored.  Returns None when the value does not start with a number.
    The original code used ``split('')``, which raises ValueError (empty
    separator) for every row.
    """
    text = str(text).strip()
    end = 0
    while end < len(text) and text[end] in "0123456789.":
        end += 1
    try:
        return float(text[:end])
    except ValueError:
        return None


def print_listings_in_price_range(db_path="test.db", low=30.0, high=50.0):
    """Print the rows of table rsf whose price lies strictly between low and high.

    For each matching row a running counter, columns 1 and 3, the parsed
    price and the second-to-last column are printed.  Rows whose price
    cannot be parsed are skipped.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Query every row of the rsf table.
        rows = conn.execute("SELECT * from rsf")
        k = 1
        for row in rows:
            # Column 7 is read as the price; this assumes the table columns
            # are in the order nid,dz,fx,dx,cx,zx,dt,jg,title,url - confirm
            # against the actual schema.
            num = parse_total_price(row[7])
            if num is None:
                continue
            if low < num < high:
                print(k, row[1], row[3], num, row[-2])
                k += 1
    finally:
        conn.close()


if __name__ == "__main__":
    print_listings_in_price_range()