
A basic Python crawler: crawling graph-structured websites depth-first and breadth-first


Distressed in heart and weighed down in thought, and only then does one rise to act.
Today's goal: crawling a complex, graph-structured website, first depth-first and then breadth-first.

1. Depth-first crawling of a graph-structured website
The code and explanation are as follows:

from bs4 import BeautifulSoup
import urllib.request


class Stack:
    def __init__(self):
        self.st = []

    def pop(self):
        return self.st.pop()

    def push(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0


def visit(url):
    global urls
    if url in urls:  # skip pages that have already been visited
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        return []  # return an empty list so callers can still iterate


start_url = "http://127.0.0.1:5000/"
urls = []


def spider(url):
    # recursive depth-first traversal
    links = visit(url)
    for link in links:
        url = start_url + link['href']
        spider(url)


def DFS():
    # depth-first traversal of the site using an explicit stack
    st = Stack()
    st.push(start_url + 'books.htm')
    while not st.empty():
        url = st.pop()
        links = visit(url)
        for link in links:  # push left to right (pages are then visited right to left)
            url = start_url + link['href']
            st.push(url)
        # for i in range(len(links) - 1, -1, -1):  # push right to left (visited left to right)
        #     url = start_url + links[i]['href']
        #     st.push(url)


# depth-first visit of the site (a graph structure)
urls = []
spider(start_url + 'books.htm')  # result of the recursive version
print()  # blank line between the two runs
urls = []
DFS()  # stack version; the push order controls left-to-right vs right-to-left
print('the end')

# An equivalent self-contained stack version, with the visited check
# done inside the loop:
# def spider(url):
#     global urls
#     stack = Stack()
#     stack.push(url)
#     while not stack.empty():
#         url = stack.pop()
#         if url not in urls:
#             urls.append(url)
#             try:
#                 data = urllib.request.urlopen(url)
#                 data = data.read()
#                 data = data.decode()
#                 soup = BeautifulSoup(data, 'lxml')
#                 print(soup.find('h3').text)
#                 links = soup.select('a')
#                 for i in range(len(links) - 1, -1, -1):
#                     href = links[i]['href']
#                     url = start_url + '/' + href
#                     stack.push(url)
#             except Exception as err:
#                 print(err)
#
# start_url = "http://127.0.0.1:5000"
# urls = []
# spider(start_url)
# print('the end')
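The push-order comment matters more than it looks: with a stack, the order in which children are pushed determines whether pages are visited left-to-right or right-to-left. The following self-contained sketch (using a made-up adjacency list standing in for the site's link graph, so no network is needed) runs the same stack-based DFS both ways:

# Toy graph (hypothetical data) standing in for the site's link structure.
graph = {
    'books': ['program', 'database', 'network'],
    'program': ['python', 'java'],
    'database': ['mysql'],
    'network': [], 'python': [], 'java': [], 'mysql': [],
}

def dfs(start, reverse_push):
    visited, order, stack = set(), [], [start]
    while stack:
        node = stack.pop()
        if node in visited:
            continue
        visited.add(node)
        order.append(node)
        children = graph[node]
        # Pushing children right-to-left makes the leftmost child pop first.
        for child in (reversed(children) if reverse_push else children):
            stack.append(child)
    return order

print(dfs('books', reverse_push=True))   # ['books', 'program', 'python', 'java', 'database', 'mysql', 'network']
print(dfs('books', reverse_push=False))  # ['books', 'network', 'database', 'mysql', 'program', 'java', 'python']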

Run result: the crawler prints each visited page's <h3> title, once per page (output screenshot omitted).
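Both listings assume a small test site running at http://127.0.0.1:5000/. The original post does not include that server, so here is a hypothetical minimal Flask app for reproducing the runs; only books.htm comes from the crawler code above, while the other page names and links are invented:

from flask import Flask

app = Flask(__name__)

# Hypothetical pages: name -> (h3 title, outgoing links).
pages = {
    'books.htm':    ('Books',    ['program.htm', 'database.htm']),
    'program.htm':  ('Program',  ['python.htm', 'java.htm']),
    'database.htm': ('Database', ['mysql.htm']),
    'python.htm':   ('Python',   []),
    'java.htm':     ('Java',     []),
    'mysql.htm':    ('MySQL',    []),
}

@app.route('/')
def index():
    return '<h3>Home</h3><a href="books.htm">books</a>'

@app.route('/<name>')
def page(name):
    title, links = pages[name]
    anchors = ''.join('<a href="%s">%s</a>' % (h, h) for h in links)
    return '<h3>%s</h3>%s' % (title, anchors)

if __name__ == '__main__':
    app.run()  # listens on http://127.0.0.1:5000/ by default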
2. Breadth-first crawling of a graph-structured website

The Python code:

from bs4 import BeautifulSoup
import urllib.request

# Breadth-first crawl of a graph-structured website, implemented with a queue
class Queue:
    def __init__(self):
        self.st = []

    def fetch(self):
        return self.st.pop(0)

    def enter(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0


def visit(url):
    global urls
    if url in urls:
        return []
    urls.append(url)
    try:
        data = urllib.request.urlopen(url)
        data = data.read()
        data = data.decode()
        soup = BeautifulSoup(data, 'lxml')
        print(soup.find('h3').text)
        links = soup.select('a')
        return links
    except Exception as err:
        print(err)
        return []  # return an empty list so the caller can still iterate

start_url = "http://127.0.0.1:5000/"
urls = []


def spider(url):
    # breadth-first traversal: fetch a page, then enqueue all of its links
    q = Queue()
    q.enter(url)  # start from the URL passed in
    while not q.empty():
        url = q.fetch()
        links = visit(url)
        for link in links:
            url = start_url + link['href']
            q.enter(url)


spider(start_url)
print('the end')
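A small aside on the Queue class above: list.pop(0) shifts every remaining element, so each fetch costs O(n). For anything beyond a toy site, collections.deque from the standard library is the usual drop-in replacement, for example:

from collections import deque

class Queue:
    # Same interface as above, backed by a deque for O(1) fetch.
    def __init__(self):
        self.st = deque()

    def fetch(self):
        return self.st.popleft()  # O(1), unlike list.pop(0)

    def enter(self, obj):
        self.st.append(obj)

    def empty(self):
        return len(self.st) == 0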

Run result: each visited page's <h3> title, now printed in breadth-first (level) order (output screenshot omitted).