Scraping Job Postings from a Recruitment Site (Liepin)
阿新 • Published: 2019-02-03
It's job-hunting time; as the saying goes, golden September, silver October. Reading through postings one by one is a pain, so I got tempted: why not scrape everything I'm interested in, and then pull up all the relevant information with a single SQL query? Much more convenient, right~
Note:
If start_urls contains only a single URL, the spider will crawl at most 40 records (and some of them will be duplicates).
The Spider:
For the search keyword, I originally meant to look for python and crawler jobs, but somewhere along the way it turned into java. I guess I really can't forget my mother tongue~
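The spider fills a LiepinspiderItem imported from liepinSpider.items. The field names below are exactly the ones the spider assigns, so a minimal items.py would look something like this:

import scrapy


class LiepinspiderItem(scrapy.Item):
    # Minimal sketch of items.py; field names are taken from the spider below
    html_url = scrapy.Field()     # URL of the job posting
    title = scrapy.Field()        # job title
    company = scrapy.Field()      # company name
    money = scrapy.Field()        # salary
    address = scrapy.Field()      # district-level address
    times = scrapy.Field()        # publication date
    job_query = scrapy.Field()    # abbreviated requirements
    tags = scrapy.Field()         # benefits
    job_content = scrapy.Field()  # full responsibilities and requirements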
import scrapy
from liepinSpider.items import LiepinspiderItem


class LiePinSpider(scrapy.Spider):
    name = 'liepin'
    allowed_domains = ['www.liepin.com']
    start_urls = [
        'https://www.liepin.com/sh/zhaopin/?dqs=020&salary=&isAnalysis=true&init=1&searchType=1&fromSearchBtn=1&jobTitles=&industries=&industryType=&d_headId=89d222c119810d9835c864b9842ca41a&d_ckId=89d222c119810d9835c864b9842ca41a&d_sfrom=search_city&d_curPage=0&d_pageSize=40&siTag=&key=java'
    ]
    # To reach page 2, insert pn1 after /zhaopin/ (pn2 for page 3, and so on):
    # https://www.liepin.com/sh/zhaopin/pn1/?dqs=&salary=&isAnalysis=true&init=1&searchType=1&fromSearchBtn=1&jobTitles=&industries=&industryType=&d_headId=89d222c119810d9835c864b9842ca41a&d_ckId=89d222c119810d9835c864b9842ca41a&d_sfrom=search_city&d_curPage=0&d_pageSize=40&siTag=&key=java
    # A gripe about Liepin's paging: after searching by keyword and region,
    # clicking through to the next page drops both filters, so I had to
    # splice the URL together by hand. Worst of all, the user experience is
    # gone: anyone who doesn't know the trick simply can't see page 2 of
    # their search results...
    # To make this more flexible, read the keyword from input() and splice
    # it into the key parameter; Chinese keywords need URL encoding first
    # (see the pagination sketch after this spider).

    def parse(self, response):
        li_list = response.css('.sojob-list li')
        for li in li_list:
            html_url = li.css('.job-name a::attr(href)').extract_first()
            yield scrapy.Request(html_url, callback=self.content)
        # The request for the next page can be issued here:
        # yield scrapy.Request('<the spliced url>', callback=self.parse)

    def content(self, response):
        item = LiepinspiderItem()
        # response.url is the address of the page being parsed
        html_url = response.url
        title = response.css('.title-info h1::text').extract_first()
        company = response.css('.title-info h3 a::text').extract_first()
        money = response.css('.job-item-title::text').extract_first()
        address = response.css('.basic-infor a::text').extract_first()
        times = response.css('.basic-infor time::attr(title)').extract_first()
        job_query_list = response.css('.job-qualifications span::text').extract()
        job_query = ''
        for job_querys in job_query_list:
            job_query += job_querys + ','
        tag_list = response.css('.tag-list span::text').extract()
        tags = ''
        for tag_span in tag_list:
            tags += tag_span + ','
        job_contents = response.css('.job-description div::text').extract()
        job_content = ''
        for job in job_contents:
            job_content += job.replace('\r\n', '')
        # Don't forget to define the matching fields in items.py!
        # URL of the job posting
        item['html_url'] = html_url
        # Job title
        item['title'] = title
        # Company name
        item['company'] = company
        # Salary
        item['money'] = money.strip()
        # Company address (district only; the full street address is also on
        # the page if you tweak the selector)
        item['address'] = address
        # Publication date
        item['times'] = times
        # Abbreviated job requirements
        item['job_query'] = job_query
        # Benefits
        item['tags'] = tags
        # Full job responsibilities and requirements
        item['job_content'] = job_content.strip()
        yield item
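As the comments above note, paging boils down to splicing pnN into the path (pn1 is page 2) and percent-encoding the keyword. A minimal sketch, assuming the query string can be trimmed down to the key parameter (the real URL carries the extra d_* parameters shown above, which can be spliced in the same way):

from urllib.parse import quote


def build_start_urls(keyword, pages=5):
    # Chinese keywords must be percent-encoded before going into the URL
    key = quote(keyword)
    # The bare /zhaopin/ path is page 1; pn1 is page 2, pn2 page 3, ...
    urls = ['https://www.liepin.com/sh/zhaopin/?key={}'.format(key)]
    urls += ['https://www.liepin.com/sh/zhaopin/pn{}/?key={}'.format(n, key)
             for n in range(1, pages)]
    return urls

Feeding build_start_urls('爬蟲') into start_urls also works around the 40-record limit mentioned at the top, since each page contributes its own batch of results.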
The Pipeline:
Don't forget to enable the pipeline in settings.py~~
ITEM_PIPELINES = {
    'liepinSpider.pipelines.LiepinspiderPipeline': 1,
}
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.exceptions import DropItem


def dbHandle():
    conn = pymysql.connect(
        host='localhost',
        user='root',
        passwd='Cs123456.',
        charset='utf8',
        db='liepin',
        use_unicode=False
    )
    return conn


class LiepinspiderPipeline(object):
    def process_item(self, item, spider):
        # Connect to the database
        db = dbHandle()
        # Open a cursor
        cursor = db.cursor()
        # Splice the SQL statement together
        sql = 'insert into liepin_list (url, title, company, money, address, times, job_query, tags, job_content) ' \
              'values ("{html_url}", "{title}", "{company}", "{money}", "{address}", "{times}", "{job_query}", "{tags}", "{job_content}");'.format(**item)
        # Skip pages that have already been stored
        if not self.db_distinct(item['html_url']):
            cursor.close()
            raise DropItem('record already exists')
        try:
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
            cursor.close()
            raise DropItem('SQL execution failed')
        cursor.close()
        return item

    # Use the posting's URL to decide whether this page was stored before
    def db_distinct(self, html_url):
        db = dbHandle()
        cursor = db.cursor()
        sql = 'select * from liepin_list where url = "{}"'.format(html_url)
        cursor.execute(sql)
        data = cursor.fetchone()
        cursor.close()
        # No row found means the URL is new
        return data is None
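One caveat on the spliced INSERT: str.format() quoting breaks as soon as a field contains a double quote, and it leaves the door open to SQL injection. pymysql supports %s placeholders, where the driver does the escaping itself. A sketch of the same statement, parameterized (insert_item is a hypothetical helper, not part of the pipeline above):

def insert_item(cursor, item):
    # Same columns as the pipeline's INSERT, but with %s placeholders so
    # pymysql escapes every value instead of str.format()
    sql = ('insert into liepin_list (url, title, company, money, address, '
           'times, job_query, tags, job_content) '
           'values (%s, %s, %s, %s, %s, %s, %s, %s, %s)')
    cursor.execute(sql, (
        item['html_url'], item['title'], item['company'], item['money'],
        item['address'], item['times'], item['job_query'], item['tags'],
        item['job_content'],
    ))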
Table schema:
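The column names are fixed by the pipeline's INSERT; the types and lengths below are my assumptions, so adjust them to taste. A minimal sketch for creating the table:

import pymysql

# Assumed types and lengths; only the column names come from the pipeline
conn = pymysql.connect(host='localhost', user='root', passwd='Cs123456.',
                       charset='utf8', db='liepin')
cursor = conn.cursor()
cursor.execute('''
    create table if not exists liepin_list (
        id int primary key auto_increment,
        url varchar(255),
        title varchar(255),
        company varchar(255),
        money varchar(64),
        address varchar(255),
        times varchar(64),
        job_query varchar(255),
        tags varchar(255),
        job_content text
    ) default charset = utf8
''')
conn.commit()
cursor.close()
conn.close()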
A sample of the queried data:
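As promised in the intro, filtering the stored postings is now a single SQL statement. A hypothetical example that pulls postings whose benefits mention 五險一金 (the filter value is just an illustration):

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='Cs123456.',
                       charset='utf8', db='liepin')
cursor = conn.cursor()
# tags holds comma-joined benefit strings, so filter by substring
cursor.execute('select title, company, money, address from liepin_list '
               'where tags like %s', ('%五險一金%',))
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()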
And with that, the job is done. Thanks for reading.