
Downloading Biquge Novels Without an App


  This is a summary of what I have been learning recently. I wrote the code two days ago, but only now have time to write it up.

  I happened to open the Biquge site (https://www.biquge.cc/) and it struck me that I should be able to download novels from it with a crawler. With that idea in mind, I started experimenting.

  

  A crawler, put plainly, is just a program that automatically imitates a browser's requests in order to fetch the content of web pages.
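  To make that concrete, here is a minimal sketch (not part of the original script) of fetching a page with Python's standard urllib; the URL and User-Agent string are placeholders:

import urllib.request

# Minimal sketch: send a browser-like request and read back the page's HTML.
# The URL and User-Agent below are placeholders, not taken from the script further down.
req = urllib.request.Request(
    'https://www.example.com/',
    headers={'User-Agent': 'Mozilla/5.0'},
)
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf-8')
print(html[:200])  # first 200 characters of the page source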

  First I used F12 (the browser developer tools) to inspect the elements: the links to each chapter's URL and the chapter body content.

  The page structure is very simple.

  The plan came together quickly: use the site's search to open the novel's detail page, grab the URL of every chapter, visit each chapter URL in turn, match the chapter text with regular expressions, and finally save the matched content to a local file. (The overall flow is sketched just below.)
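  In code, the pipeline reduces to a few calls (a sketch only; novel_detail, content, and save_novel are the functions defined in the full script further down):

# Sketch of the overall flow, wired together from the functions in the full script below.
def download_novel(book_name):
    novel_url = novel_detail(book_name)             # site search  -> detail-page URL
    chapter_urls = content(novel_url)               # detail page  -> list of chapter URLs
    save_novel(novel_url, chapter_urls, book_name)  # each chapter -> appended to <book_name>.txt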

  Along the way I tripped over a small detail: I used re.findall() for the matching, and it always returns a list!!!
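  A quick standalone snippet (not from the script) to illustrate: re.findall() gives back a list even when there is only one match, so the result has to be indexed before it can be treated as a string:

import re

matches = re.findall(r'<h1>(.*?)</h1>', '<h1>第一章</h1>')
print(matches)     # ['第一章']  -- a list, even for a single match
print(matches[0])  # '第一章'    -- index it to get the string itself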

  The output of a run is shown below:

  [screenshot of the program's output]

  

  The full code is as follows:

  

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/10/20 15:46
# @Author  : yuantup
# @Site    : 
# @File    : biquge.py
# @Software: PyCharm
import urllib.request
import re
import time
import os


def open_url(url):
    # Fetch a URL with browser-like headers and return the raw HTML bytes
    # Set the request headers as a dict
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'Accept-Encoding': 'gzip',  # asking for gzip/deflate can break the plain read() below, so it stays commented out
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'sou.xanbhx.com',
        'Referer': 'https://www.biquge.cc/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
    }
    # Cookies or a proxy could be configured here if needed
    # proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib.request.build_opener()
    # Convert the dict into the list of (key, value) tuples that addheaders expects
    headers = []
    for key, value in head.items():
        item = (key, value)
        headers.append(item)
    opener.addheaders = headers
    urllib.request.install_opener(opener)
    response = urllib.request.urlopen(url)
    html = response.read()
    time.sleep(1)
    return html


def novel_detail(book_name):
    # Search the site for the given novel name and return the URL of its detail page
    # Novels with duplicate names are not handled yet!!!
    zh_book_name = urllib.request.quote(book_name)
    url = 'https://sou.xanbhx.com/search?siteid=biqugecc&q=' + zh_book_name
    html = open_url(url).decode('utf-8')
    name_pa = r'<span class="s2">.*?<a href="(.*?)" target="_blank">.*?(\S*?)</a>'
    name_list = re.findall(name_pa, html, re.S)
    book_url = None
    if name_list and name_list[0][1] == book_name:
        book_url = name_list[0][0]
        print(book_url)
    else:
        print('對不起,該網址沒有找到你需要的書。')
    return book_url


def content(url):
    # Extract the list of chapter URLs from the novel's detail page
    html = open_url(url).decode('utf-8')
    main_body_pa = r'最新章節(提示:已啟用緩存技術,最新章節可能會延時顯示,登錄書架即可實時查看。).*?<dt>(.*?)</div>'
    chapter_url_pa = r'<a style="" href="(.*?)">'
    main_body = re.findall(main_body_pa, html, re.S)
    # Remember: re.findall() returns a list!!!
    chapter_url = re.findall(chapter_url_pa, main_body[0])
    time.sleep(2)
    return chapter_url


def save_novel(novel_url, content_url_list, book_name):
    # Download every chapter and append it to <book_name>.txt
    for i in range(len(content_url_list)):
        real_url = novel_url + content_url_list[i]
        html = open_url(real_url).decode('utf-8')
        chapter_name_pa = '<h1>(.*?)</h1>'
        chapter_name = re.search(chapter_name_pa, html).group(1)
        content_pa = r'<div id="content">(.*?)<script>'
        content1 = re.findall(content_pa, html, re.S)
        content2 = content1[0].replace('&nbsp;&nbsp;&nbsp;&nbsp;', '')
        content3 = content2.replace('<br/>', '\n')
        content4 = content3.replace('</br>', '')
        re_chapter_name = chapter_name.replace(' ', '')
        # Some chapter bodies repeat the chapter title; strip it out here
        content5 = content4.replace(re_chapter_name, '')
        whole_content = chapter_name + '\n' + content5
        with open(book_name + '.txt', 'a', encoding='utf-8') as f:
            f.write(whole_content)
            print('成功下載 {}'.format(chapter_name))
        time.sleep(1)


def main():
    path = r'E:\spiser_sons\books'
    print(os.getcwd())
    if os.path.exists(path):
        os.chdir(path)
        print(os.getcwd())
    else:
        os.mkdir(path)
        os.chdir(path)
    book_name = input('請輸入想下載小說的名字:')
    novel_url = novel_detail(book_name)
    content_url_list = content(novel_url)
    save_novel(novel_url, content_url_list, book_name)


if __name__ == '__main__':
    main()
