python基礎爬蟲——使用深度優先和廣度優先爬取圖結構網站
阿新 • • 發佈:2021-02-16
困於心衡於慮而後作
今天要學習的目標是:深度優先爬取複雜網站,圖結構網站
1.深度優先爬取圖結構網站
程式碼及解釋如下:
from bs4 import BeautifulSoup
import urllib.request
class Stack:
    """A minimal LIFO stack backed by a Python list."""

    def __init__(self):
        self.st = []

    def push(self, obj):
        """Place *obj* on top of the stack."""
        self.st.append(obj)

    def pop(self):
        """Remove and return the most recently pushed item."""
        return self.st.pop()

    def empty(self):
        """Return True when the stack holds no items."""
        return not self.st
def visit(url):
    """Fetch *url*, print its <h3> heading, and return its <a> link tags.

    Returns [] when the URL was already visited, or on any fetch/parse
    error, so callers can always iterate the result safely.
    Side effect: appends *url* to the module-global `urls` visited list.
    """
    global urls
    if url in urls:
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        # Fix: the original fell through and returned None here, which made
        # every caller's `for link in links` raise TypeError on a bad page.
        return []
start_url = "http://127.0.0.1:5000/"  # base address of the local test site
urls = []  # visited-URL list shared (via `global`) with visit()
def spider(url):
    """Recursively crawl the link graph depth-first starting at *url*.

    visit() returns [] for already-seen pages, which is what terminates
    the recursion on cycles in the graph.
    """
    links = visit(url)
    # Guard: visit() may return None on a fetch error (original crashed here).
    for link in links or []:
        spider(start_url + link['href'])
def DFS():
    """Depth-first crawl of the site using an explicit stack.

    NOTE: pushing links in document order means the RIGHTMOST link is
    expanded first (a stack pops in reverse push order) — the original
    comment claiming left-to-right was inverted. Push the links reversed
    (see below) to expand left-to-right instead.
    """
    st = Stack()
    st.push(start_url + 'books.htm')
    while not st.empty():
        url = st.pop()
        links = visit(url)
        # Guard: visit() may return None on a fetch error.
        for link in links or []:  # expands right-to-left
            st.push(start_url + link['href'])
        # To expand left-to-right, push in reverse document order instead:
        # for link in reversed(links or []):
        #     st.push(start_url + link['href'])
# Depth-first crawl of the site (graph structure), done two ways.
urls = []
spider(start_url + 'books.htm') # recursive depth-first crawl
print() # blank line between the two runs
urls = []  # reset the visited list before the second crawl
DFS() # explicit-stack crawl; push order controls left-to-right vs right-to-left
print('the end')
# def spider(url):
# global urls
# stack = Stack()
# stack.push(url)
# while not stack.empty():
# url = stack.pop()
# if url not in urls:
# urls.append(url)
# try:
# data = urllib.request.urlopen(url)
# data = data.read()
# data = data.decode()
# soup = BeautifulSoup(data, 'lxml')
# print(soup.find('h3').text)
# links = soup.select('a')
# for i in range(len(links) - 1, -1, -1):
# href = links[i]['href']
# url = start_url + '/' + href
# stack.push(url)
# except Exception as err:
# print(err)
# start_url = "http://127.0.0.1:5000"
# urls = []
# spider(start_url)
# print('the end')
執行結果:
2.廣度優先爬取圖網站
python程式碼:
from bs4 import BeautifulSoup
import urllib.request
# 廣度優先佇列實現爬取圖結構網站
class Queue:
    """A minimal FIFO queue for the breadth-first crawl.

    Uses collections.deque so dequeueing is O(1); the original used
    list.pop(0), which shifts every remaining element (O(n) per fetch).
    """

    def __init__(self):
        from collections import deque  # local import keeps the class self-contained
        self.st = deque()

    def fetch(self):
        """Remove and return the oldest (front) item."""
        return self.st.popleft()

    def enter(self, obj):
        """Append *obj* at the back of the queue."""
        self.st.append(obj)

    def empty(self):
        """Return True when the queue holds no items."""
        return len(self.st) == 0
def visit(url):
    """Fetch *url*, print its <h3> heading, and return its <a> link tags.

    Returns [] when the URL was already visited, or on any fetch/parse
    error, so callers can always iterate the result safely.
    Side effect: appends *url* to the module-global `urls` visited list.
    """
    global urls
    if url in urls:
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        # Fix: the original fell through and returned None here, which made
        # the caller's `for link in links` raise TypeError on a bad page.
        return []
start_url = "http://127.0.0.1:5000/"  # base address of the local test site
urls = []  # visited-URL list shared (via `global`) with visit()
def spider(url):
    """Breadth-first crawl of the link graph starting at *url*.

    visit() returns [] for already-seen pages, which prevents revisiting
    nodes and guarantees termination on cyclic graphs.
    """
    q = Queue()
    # Fix: the original enqueued start_url here, silently ignoring the
    # `url` parameter it was given.
    q.enter(url)
    while not q.empty():
        page = q.fetch()
        links = visit(page)
        # Guard: visit() may return None on a fetch error.
        for link in links or []:
            q.enter(start_url + link['href'])
spider(start_url)  # breadth-first crawl of the whole site from the root page
print('the end')
執行結果: