1. 程式人生 > >Python爬蟲之利用BeautifulSoup爬取豆瓣小說(三)——將小說信息寫入文件

Python爬蟲之利用BeautifulSoup爬取豆瓣小說(三)——將小說信息寫入文件

設置 one 行為 blog 應該 += html uil rate

 1 #-*-coding:utf-8-*-
 2 import urllib2
 3 from bs4 import BeautifulSoup
 4 
 5 class dbxs:
 6 
 7     def __init__(self):
 8         self.pageIndex = 0
 9         self.enable = True
10         self.file = None
11         self.content = []
12         
13         
14     #獲取html頁面的內容
15     def getPage(self, pageIndex):
16 try: 17 #設置代理ip 18 enable_proxy = True 19 proxy_handler = urllib2.ProxyHandler({Http: 113.118.170.230:808}) 20 null_proxy_handler = urllib2.ProxyHandler({}) 21 if enable_proxy: 22 opener = urllib2.build_opener(proxy_handler)
23 else: 24 opener = urllib2.build_opener(null_proxy_handler) 25 urllib2.install_opener(opener) 26 #獲得頁面響應的內容 27 url = https://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book + "?start=" + str(pageIndex) 28 #設置請求頭部信息,模擬瀏覽器的行為 29 my_headers = {
User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0)} 30 request = urllib2.Request(url, headers = my_headers) 31 response = urllib2.urlopen(request) 32 return response.read() 33 except urllib2.URLError, e: 34 if hasattr(e, "code"): 35 print e.code 36 if hasattr(e, "reason"): 37 print e.reason 38 return None 39 40 #過濾查找這一頁的小說名字,信息和評分 41 def getContent(self, pageIndex, content): 42 pageCode = self.getPage(pageIndex) 43 soup = BeautifulSoup(pageCode, html.parser) 44 #在獲得相應的內容中找出所有標簽為<dd>的內容(裏面包含了我們需要的小說信息) 45 contents = soup.find_all(dd) 46 47 if contents: 48 for item in contents: 49 title = item.find(class_ = title).string.encode(utf-8) 50 info = item.find(class_ = desc).string.strip().encode(utf-8) 51 rate = item.find(class_ = rating_nums) 52 #通過試驗,我們發現某一頁可能存在小說沒有評分,如果我們不判斷rate,那麽可能就出現報錯 53 if rate: 54 rates = rate.string.encode(utf-8) 55 content.append([title, info, rates]) 56 57 else: 58 content.append([title, info]) 59 #如果頁面不包含<dd>標簽,我們應該停止 60 else: 61 print u"所有頁面已加載完" 62 self.enable = False 63 64 return content 65 66 67 68 #寫入文件 69 def writeData(self, content): 70 self.file = open("bdxs.txt", "w+") #必須在for循環外面,不然每一次寫入都會覆蓋之前的數據 71 for item in content: 72 if len(item) == 3: 73 self.file.write(item[0] + "\n") 74 self.file.write(item[1] + "\n") 75 self.file.write(u"評分:" + item[2] + "\n\n") 76 else: 77 self.file.write(item[0] + "\n") 78 self.file.write(item[1] + "\n") 79 self.file.write("========================================\n\n") 80 81 82 #創建一個開始方法 83 def start(self): 84 x = 1 85 while self.enable == True: 86 content = self.getContent(self.pageIndex, self.content) 87 if self.enable == True: 88 print "正在寫入第%s頁..." %x 89 self.writeData(content) 90 self.pageIndex += 15 91 x += 1 92 93 94 DBXS = dbxs() 95 DBXS.start()

這段代碼我還沒有完全理解透徹,比如每一頁的小說信息寫入完成後,怎麼在後面標註這是第幾頁;後期我會繼續完善它。

Python爬蟲之利用BeautifulSoup爬取豆瓣小說(三)——將小說信息寫入文件