異常處理爬蟲 demo、find 函式和 findAll 函式，以及使用正則表示式查詢元素
阿新 • • 發佈:2018-12-19
程式碼位於書的第1-2章
"""Beautiful Soup tutorial snippets: basic fetch, error handling, find/findAll,
tree navigation, and attribute matching with a regular expression.

Only the last example (regular-expression search) is live; the earlier ones
are kept as commented-out reference code.
"""

import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# --- Example 1: fetch a page and print its <h1> ---------------------------
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# html = urlopen("http://pythonscraping.com/pages/page1.html")
# # "html.parser" selects the HTML parser that Beautiful Soup should use
# bsObj = BeautifulSoup(html.read() , "html.parser")
# print(bsObj.h1)

# --- Example 2: crawler with exception handling ---------------------------
# from urllib.error import HTTPError, URLError
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# def get_title(url):
#     try:
#         html = urlopen(url)
#     except (HTTPError, URLError) as e:
#         return None
#     try:
#         bsObj = BeautifulSoup(html.read(), "html.parser")
#         title = bsObj.body.h1
#     except AttributeError as e:
#         return None
#     return title
# title = get_title("http://pythonscraping.com/pages/page1.html")
# if title is None :
#     print("title could not be found")
# else :
#     print(title)

# --- Example 3: the find and findAll functions ----------------------------
# html = urlopen("http://pythonscraping.com/pages/warandpeace.html")
# bsObj=BeautifulSoup(html.read(), "html.parser")
# nameList=bsObj.findAll("span", {"class": "green"})
# for name in nameList:
#     print(name.get_text())

# --- Example 4: searching by navigating the tree --------------------------
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# html = urlopen("http://www.pythonscraping.com/pages/page3.html")
# bsObj = BeautifulSoup(html.read(), "html.parser")
# print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())

# --- Example 5: searching with a regular expression (live) ----------------
# Raw string so the backslashes reach the regex engine untouched; a plain
# string with "\." is an invalid escape sequence and triggers a warning.
# "/" has no special meaning in Python regexes, so it needs no escaping.
GIFT_IMAGE_SRC = re.compile(r"\.\./img/gifts/img.*\.jpg")

# The context manager closes the HTTP response once the page has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html.read(), "html.parser")

# Select every <img> whose src attribute matches the gift-image pattern.
images = bsObj.findAll("img", {"src": GIFT_IMAGE_SRC})
for image in images:
    print(image["src"])