程式人生 > 2018/7/21 Python 爬蟲學習

2018/7/21 Python 爬蟲學習

write 5.0 http ati lib res txt文本 urllib agent

2018/7/21,這幾天整理出來的一些 Python 爬蟲學習代碼(以下代碼皆為 Python 2 語法,使用 urllib2 模組)。

import urllib2

# Open the URL, read the whole response body, and print it to stdout.
page = urllib2.urlopen("http://baidu.com")
body = page.read()
print(body)

進一步,可以先建立一個 Request 物件,再把它交給 urlopen 發送請求:

import urllib2

# Wrap the URL in a Request object first, then pass that object to urlopen
# and print the response body it returns.
request = urllib2.Request("http://www.baidu.com")
print(urllib2.urlopen(request).read())

偽裝瀏覽器

import urllib2
# Send a browser-like User-Agent header with the request instead of
# urllib2's default one, then print the response body.
url = "http://www.baidu.com"
user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
headers = {"User-Agent": user_agent}
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)
the_page = response.read()
print(the_page)

代碼:輸入輸出網頁

# _*_ coding:utf-8 _*_
import urllib2

def load_page(url):
    """Download *url* and return the raw response body.

    The request carries a browser-like User-Agent header.
    """
    user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

def tieba_spider(url, begin_page, end_page):
    """Fetch and print every page from begin_page to end_page (inclusive).

    For page number i the offset 50 * (i - 1) is appended to url as-is,
    so url presumably ends with the query parameter that takes the offset
    (TODO confirm against what the user is expected to type in).
    """
    for i in range(begin_page, end_page + 1):
        pn = 50 * (i - 1)
        html = load_page(url + str(pn))
        # %d formats the page number; a bare % directly before the
        # character would be an invalid conversion and raise ValueError.
        print("##################第%d頁########################" % i)
        print(html)
        print("###############################################")

if __name__ == "__main__":
    # Ask for the base URL first, then the inclusive page range, and
    # hand all three to the spider.
    base_url = raw_input("請輸入貼吧的url地址")
    first_page = int(raw_input("請輸入起始頁碼"))
    last_page = int(raw_input("請輸入終止頁碼"))
    tieba_spider(base_url, first_page, last_page)

代碼:輸入輸出保存網頁

# _*_ coding:utf-8 _*_
import urllib2

def load_page(url):
    """Download *url* and return the raw response body.

    The request carries a browser-like User-Agent header.
    """
    user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

def write_to_file(file_name, txt):
    """Write the text *txt* to the file *file_name*.

    The file is opened in "w" mode, so existing content is replaced.
    A progress message naming the file is printed first.
    """
    print("正在存儲文件" + file_name)
    # The with-block closes the file even if the write fails.
    with open(file_name, "w") as f:
        f.write(txt)


# Backward-compatible alias for the earlier, misspelled function name.
writee_to_file = write_to_file

def tieba_spider(url, begin_page, end_page):
    """Fetch pages begin_page..end_page (inclusive) and save each to disk.

    For page number i the offset 50 * (i - 1) is appended to url as-is,
    so url presumably ends with the query parameter that takes the offset
    (TODO confirm against what the user is expected to type in).
    Each page is written to "<i>.html", a relative path.
    """
    for i in range(begin_page, end_page + 1):
        pn = 50 * (i - 1)
        my_url = url + str(pn)
        html = load_page(my_url)

        # One output file per page, named after the page number.
        file_name = str(i) + ".html"
        write_to_file(file_name, html)

if __name__ == "__main__":
    # Ask for the base URL first, then the inclusive page range, and
    # hand all three to the spider.
    base_url = raw_input("請輸入貼吧的url地址")
    first_page = int(raw_input("請輸入起始頁碼"))
    last_page = int(raw_input("請輸入終止頁碼"))
    tieba_spider(base_url, first_page, last_page)

2018/7/21 Python 爬蟲學習