程式人生 > 爬蟲爬當當網書籍信息

爬蟲爬當當網書籍信息

標籤:表達式, sel, soup, bs4, cti, rom, rtt, utf, system

拖了好久的一個爬蟲

先上代碼 文字慢慢補

 1 # -*- coding: utf-8 -*
 2 
 3 import urllib2
 4 import xlwt
 5 from bs4 import BeautifulSoup
 6 from datashape import json
 7 import re
 8 import json
 9 import requests
10 
11 
12 def getJsonText(url):
13     try:
14         r = requests.get(url, timeout=1
) 15 r.raise_for_status() 16 r.encoding = r.apparent_encoding 17 return r.text 18 except: 19 print 獲取失敗 20 return ‘‘ 21 22 23 def getgood(url): 24 html = urllib2.urlopen(url).read() 25 26 # 用正則表達式拿取 27 ma = re.search(r"productId":"[\d]+"
, html) 28 productId = eval(ma.group().split(:)[-1]) 29 categoryPath = eval(ma.group().split(:)[-1]) 30 mainProductId = eval(ma.group().split(:)[-1]) 31 # 對Ajax的url進行拼接 32 json_url = http://product.dangdang.com/index.php?r=comment%2Flist&productId={productId}&categoryPath={categoryPath}&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0
.format( 33 productId=productId, categoryPath=categoryPath, mainProductId=mainProductId) 34 # 調用方法,下載下來json數據 35 json_html = json.loads(getJsonText(json_url)) 36 summary = json_html[data][list][summary] 37 data = {} 38 data[all_comment_num] = summary[total_comment_num] # 總評論數 39 data[good_comment_num] = summary[total_crazy_count] # 好評數 40 data[middle_comment_num] = summary[total_indifferent_count] # 中評數 41 data[bad_comment_num] = summary[total_detest_count] # 差評數 42 data[good_rate] = summary[goodRate] # 好評率 43 return data 44 45 def main(): 46 wb = xlwt.Workbook() 47 sheet1 = wb.add_sheet("Sheet") 48 sheet1.write(0, 0, unicode(序號, "utf-8")) 49 sheet1.write(0, 1, unicode(書名, "utf-8")) 50 sheet1.write(0, 2, unicode(價格, "utf-8")) 51 sheet1.write(0, 3, unicode(折扣, "utf-8")) 52 sheet1.write(0, 4, unicode(評論數, "utf-8")) 53 sheet1.write(0, 5, unicode(好評, "utf-8")) 54 sheet1.write(0, 6, unicode(中評, "utf-8")) 55 sheet1.write(0, 7, unicode(差評, "utf-8")) 56 sheet1.write(0, 8, unicode(好評率, "utf-8")) 57 58 for page in range(25): 59 60 url = http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%d % (page+1) 61 get = urllib2.urlopen(url).read() 62 data = BeautifulSoup(get, lxml) 63 64 bookname = data.find_all(div, attrs={class: name}) 65 bookstar = data.find_all(div, attrs={class: star}) 66 bookprice = data.find_all(div, attrs={class: price}) 67 bookoff = data.find_all(span, attrs={class: price_s}) 68 69 for i in range(20): 70 bookurl = bookname[i].find(a)[href] 71 data = getgood(bookurl) 72 print (str(page*20+i+1) + " " 73 + bookname[i].find(a)[title] + " " # 書名 74 + bookprice[i].find(span).text[1:] + " " # 價格 75 + bookoff[i].text[:-1] + " " # 折扣 76 + bookstar[i].find(a).text[:-3] + " " # 評論數 77 + data[good_comment_num] + " " # 好評數 78 + data[middle_comment_num] + " " # 中評數 79 + data[bad_comment_num] + " " # 差評數 80 + data[good_rate] + " " # 好評率 81 ) 82 83 sheet1.write(page * 20 + i + 1, 0, page * 20 + i + 1) 84 sheet1.write(page * 20 + i + 1, 1, 
bookname[i].find(a)[title]) 85 sheet1.write(page * 20 + i + 1, 2, bookprice[i].find(span).text[1:]) 86 sheet1.write(page * 20 + i + 1, 3, bookoff[i].text[:-1]) 87 sheet1.write(page * 20 + i + 1, 4, bookstar[i].find(a).text[:-3]) 88 sheet1.write(page * 20 + i + 1, 5, data[good_comment_num]) 89 sheet1.write(page * 20 + i + 1, 6, data[middle_comment_num]) 90 sheet1.write(page * 20 + i + 1, 7, data[bad_comment_num]) 91 sheet1.write(page * 20 + i + 1, 8, data[good_rate]) 92 wb.save(test.xls) 93 94 main()

爬蟲爬當當網書籍信息