# 網路爬蟲:淘女郎
# 阿新 • • 發佈:2019-02-18
#_*_ coding:utf-8 _*_
import urllib.request
from bs4 import BeautifulSoup
import os
import re
from selenium import webdriver
class Spider:
    """Scrape model ("MM") listings from the Taobao ranking pages and save
    each model's photos into a directory named after her.

    Pipeline: ranking page -> per-model detail URL -> (headless browser)
    personal-domain link -> image URLs -> files on disk.
    """

    def __init__(self):
        # Base URL of the paginated ranking list; a "?page=N" query selects the page.
        self.siteURL = "http://mm.taobao.com/json/request_top_list.htm"

    def getPage(self, pageIndex):
        """Fetch ranking page *pageIndex* and return it as a BeautifulSoup tree."""
        url = self.siteURL + "?page=" + str(pageIndex)
        print("第%s頁淘女郎網址:" % str(pageIndex) + url)
        print("分別為:")
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        # NOTE(review): assumes the site serves GBK-encoded HTML — confirm; a
        # wrong charset here raises UnicodeDecodeError.
        html = response.read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        return soup

    def getContents(self, pageIndex):
        """Return [[name, age, address, profile_href], ...] for every model
        listed on ranking page *pageIndex*."""
        page = self.getPage(pageIndex)
        contents = []
        for link in page.findAll("p", {"class": "top"}):
            name = link.a.get_text()
            age = link.em.get_text()
            address = link.span.get_text()
            href = link.a.attrs["href"]  # protocol-relative profile URL ("//...")
            contents.append([name, age, address, href])
        return contents

    def getDetailPage(self, infoURL):
        """Download a model's detail page and return the raw decoded HTML."""
        response = urllib.request.urlopen(infoURL)
        return response.read().decode("gbk")

    @staticmethod
    def _extract_domain(page_source):
        """Pull the personal-domain link out of rendered detail-page HTML.

        Returns the '//mm.taobao.com/...' URL, or None when the page has no
        personal-domain <span>.

        Uses a capture group with [^<]* instead of the original
        lstrip("<span>")/rstrip("</span>") — those strip *character sets*,
        so a domain ending in 'a', 'n', 's' or 'p' lost trailing letters
        (e.g. ".../titikatrina" became ".../titikatri").  [^<]* also keeps
        the match inside a single <span> where greedy .* could span tags.
        """
        match = re.search(r'<span>(//mm\.taobao\.com/[^<]*)</span>', page_source)
        return match.group(1) if match else None

    def get_Peronsal_address(self, detailURL):
        """Render the JS-populated detail page and return the model's personal
        domain link ('//mm.taobao.com/...'), or False when she has none.

        The domain is injected dynamically, so a plain HTTP fetch parsed with
        BeautifulSoup cannot see it; a headless browser is required.
        """
        # HACK: PhantomJS support is deprecated in Selenium — consider
        # headless Chrome/Firefox when upgrading.
        driver = webdriver.PhantomJS()
        try:
            driver.get(detailURL)
            pageSource = driver.page_source
        finally:
            # The original never shut the browser down, leaking a PhantomJS
            # process per call.
            driver.quit()
        link = self._extract_domain(pageSource)
        if link is None:
            print("她沒有個人域名")
            return False
        return link

    def getAllImg(self, detail_Page_link):
        """Return every <img> tag on the personal page whose src is hosted on
        img.alicdn.com.  *detail_Page_link* is a protocol-relative URL."""
        detail_Page_link = "https:" + detail_Page_link
        request = urllib.request.Request(detail_Page_link)
        response = urllib.request.urlopen(request)
        html = response.read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        # Dots escaped so '.' cannot match arbitrary characters in the host.
        images = soup.findAll("img", {"src": re.compile(r"//img\.alicdn\.com/.*")})
        return images

    def saveImgs(self, images, name):
        """Save every image tag in *images* into directory *name* as
        1.jpg, 2.jpg, ...  (directory is assumed to exist, see mkdir)."""
        print("發現", name, "共有", len(images), "張照片")
        for number, image in enumerate(images, start=1):
            imageURL = "http:" + image["src"]
            fileName = name + "/" + str(number) + ".jpg"
            self.saveImg(imageURL, fileName)

    def saveImg(self, imageURL, fileName):
        """Download *imageURL* to *fileName*, skipping files that already
        exist so re-runs don't re-download."""
        if os.path.exists(fileName):
            # Already present from a previous run — leave it alone.
            print("名為", fileName, "的圖片已經成功下載")
            return
        try:
            u = urllib.request.urlopen(imageURL)
            data = u.read()
            # 'with' guarantees the handle is closed even if write() raises;
            # the original leaked the file object on an exception.
            with open(fileName, 'wb') as f:
                f.write(data)
            print("正在悄悄儲存她的一張圖片為", fileName)
        except urllib.error.HTTPError as reason:
            # Best-effort: report the failed image and move on.
            print(reason)

    def mkdir(self, path):
        """Create directory *path* (and parents) if missing.

        Returns True when the directory was created, False when it already
        existed."""
        path = path.strip()
        if os.path.exists(path):
            print("名為", path, "的資料夾已經建立成功")
            return False
        print("偷偷新建了名字叫做", path, "的資料夾")
        os.makedirs(path)
        return True

    def savePageInfo(self, pageIndex):
        """Save the info and photos of every model on ranking page *pageIndex*."""
        contents = self.getContents(pageIndex)
        for item in contents:
            # item = [name, age, address, profile_href]
            print("發現一位模特,名字叫", item[0], "芳齡", item[1], ",她住在", item[2])
            print("正在偷偷地儲存", item[0], "的資訊")
            print("又意外地發現她的地址是", "https:" + item[3] + "&is_coment=false")
            detailURL = "https:" + item[3] + "&is_coment=false"
            detail_Page_link = self.get_Peronsal_address(detailURL)
            if not detail_Page_link:
                # FIX: the original `break` aborted the whole page as soon as
                # one model had no personal domain; skip her instead.
                continue
            images_links = self.getAllImg(detail_Page_link)
            self.mkdir(item[0])
            self.saveImgs(images_links, item[0])

    def savePagesInfo(self, start, end):
        """Process ranking pages *start*..*end* inclusive."""
        for i in range(start, end + 1):
            print("正在偷偷尋找第" + str(i) + "個地方,看看MM們在不在")
            self.savePageInfo(i)
if __name__ == "__main__":
    # Crawl ranking pages 1-2 only when run as a script, so importing this
    # module for reuse does not kick off a network crawl.
    spider = Spider()
    spider.savePagesInfo(1, 2)