Nodejs學習筆記(十一)—數據采集器示例(request和cheerio)
寫在之前
很多人都有做數據采集的需求,用不同的語言,不同的方式都能實現,我以前也用C#寫過,主要還是發送各類請求和正則解析數據比較繁瑣些,總體來說沒啥不好的,就是效率要差一些,
用nodejs寫采集程序還是比較有效率(可能也只是相對C#來說),今天主要用一個示例來說一下使用nodejs實現數據采集器,主要使用到request和cheerio。
request :用於http請求
https://github.com/request/request
cheerio:用於提取request返回的html中需要的信息(和jquery用法一致)
https://github.com/cheeriojs/cheerio
示例
單獨去說API用法沒什麽意思也沒必要記住全部API,下面開始示例
還是說點閑話:
nodejs開發工具還是很多,以前我也很推薦sublime,自從微軟推出了Visual Studio Code後就轉用它去做nodejs開發。
用它開發還是比較舒服的,免配置、啟動快、自動補全、查看定義和引用、搜索快等,有VS的一貫風格,應該會越做越好,所以推薦一下^_^!
示例要求
從 http://36kr.com/ 中抓取其中文章的“標題”、“地址”、“發布時間”、“封面圖片”
采集器
1.建立項目文件夾sampleDAU
2.建立package.json文件
{ "name": "Wilson_SampleDAU", "version": "0.0.1", "private": false, "dependencies": { "request":"*", "cheerio":"*" } }
3.在終端中用npm安裝引用
# Switch to the project root (replace the placeholder with the real path),
# then install the dependencies declared in package.json.
cd 項目根目錄
npm install
4.建立app.js編寫采集器代碼
首先要用瀏覽器打開要采集的URL,使用開發者工具查看HTML結構,然後根據結構寫解析代碼
/*
 * Purpose: data collector sample
 * Author:  Wilson
 * Date:    2015-07-29
 */
var request = require('request'),
    cheerio = require('cheerio'),
    URL_36KR = 'http://36kr.com/'; // 36kr

/* Start the data collector: request the 36kr page. */
function dataCollectorStartup() {
    dataRequest(URL_36KR);
}

/*
 * Send a GET request to dataUrl and pass the response body to the parser
 * that matches the URL.
 *
 * dataUrl - the URL to fetch; only URL_36KR has a parser attached.
 *
 * On a request error the URL and the error are logged and nothing is parsed.
 */
function dataRequest(dataUrl) {
    request({
        url: dataUrl,
        method: 'GET'
    }, function(err, res, body) {
        if (err) {
            console.log(dataUrl);
            console.error('[ERROR]Collection' + err);
            return;
        }

        // Dispatch on the requested URL so further sources can be added as new cases.
        switch (dataUrl) {
            case URL_36KR:
                dataParse36Kr(body);
                break;
        }
    });
}

/*
 * Parse the 36kr HTML and print title, address, time and cover of each article.
 *
 * body - HTML text returned by the request.
 */
function dataParse36Kr(body) {
    console.log('============================================================================================');
    console.log('======================================36kr==================================================');
    console.log('============================================================================================');

    var $ = cheerio.load(body);
    var articles = $('article');

    for (var i = 0; i < articles.length; i++) {
        var article = articles[i];
        var descDoms = $(article).find('.desc');

        // <article> elements without a .desc child are skipped.
        if (descDoms.length === 0) {
            continue;
        }

        var coverDom = $(article).children().first();
        var titleDom = $(descDoms).find('.info_flow_news_title');
        var timeDom = $(descDoms).find('.timeago');

        var titleVal = titleDom.text();
        var urlVal = titleDom.attr('href');
        var timeVal = timeDom.attr('title');
        var coverUrl = coverDom.attr('data-lazyload');

        // Convert the time to seconds; NaN when the attribute is missing or unparsable.
        var timeDateSecs = new Date(timeVal).getTime() / 1000;

        // Only print entries whose title element carries an href.
        if (urlVal != null) {
            console.info('--------------------------------');
            console.info('標題:' + titleVal);
            console.info('地址:' + urlVal);
            console.info('時間:' + timeDateSecs);
            console.info('封面:' + coverUrl);
            console.info('--------------------------------');
        }
    }
}

dataCollectorStartup();
測試結果
這個采集器就完成了,其實就是用request發一個get請求,請求回調中會返回body即HTML代碼,再通過cheerio庫用和jquery一樣的語法進行解析,取出想要的數據!
加入代理
做一個采集器DEMO上面就基本完成了。如果需要長期使用為了防止網站屏蔽,還是需要加入一個代理列表
為了示範,我從網上的免費代理中挑出一些,制作成proxylist.js,其中提供一個隨機取一條代理的函數
proxylist.js
/*
 * Sample proxy list; each entry holds the proxy's ip and port as strings.
 */
var PROXY_LIST = [
    {"ip":"111.1.55.136","port":"55336"},{"ip":"111.1.54.91","port":"55336"},{"ip":"111.1.56.19","port":"55336"},
    {"ip":"112.114.63.16","port":"55336"},{"ip":"106.58.63.83","port":"55336"},{"ip":"119.188.133.54","port":"55336"},
    {"ip":"106.58.63.84","port":"55336"},{"ip":"183.95.132.171","port":"55336"},{"ip":"11.12.14.9","port":"55336"},
    {"ip":"60.164.223.16","port":"55336"},{"ip":"117.185.13.87","port":"8080"},{"ip":"112.114.63.20","port":"55336"},
    {"ip":"188.134.19.102","port":"3129"},{"ip":"106.58.63.80","port":"55336"},{"ip":"60.164.223.20","port":"55336"},
    {"ip":"106.58.63.78","port":"55336"},{"ip":"112.114.63.23","port":"55336"},{"ip":"112.114.63.30","port":"55336"},
    {"ip":"60.164.223.14","port":"55336"},{"ip":"190.202.82.234","port":"3128"},{"ip":"60.164.223.15","port":"55336"},
    {"ip":"60.164.223.5","port":"55336"},{"ip":"221.204.9.28","port":"55336"},{"ip":"60.164.223.2","port":"55336"},
    {"ip":"139.214.113.84","port":"55336"},
    {"ip":"112.25.49.14","port":"55336"},{"ip":"221.204.9.19","port":"55336"},
    {"ip":"221.204.9.39","port":"55336"},{"ip":"113.207.57.18","port":"55336"},
    {"ip":"112.25.62.15","port":"55336"},
    {"ip":"60.5.255.143","port":"55336"},{"ip":"221.204.9.18","port":"55336"},{"ip":"60.5.255.145","port":"55336"},
    {"ip":"221.204.9.16","port":"55336"},{"ip":"183.232.82.132","port":"55336"},{"ip":"113.207.62.78","port":"55336"},
    {"ip":"60.5.255.144","port":"55336"},
    {"ip":"60.5.255.141","port":"55336"},{"ip":"221.204.9.23","port":"55336"},
    {"ip":"157.122.96.50","port":"55336"},{"ip":"218.61.39.41","port":"55336"},
    {"ip":"221.204.9.26","port":"55336"},
    {"ip":"112.112.43.213","port":"55336"},{"ip":"60.5.255.138","port":"55336"},{"ip":"60.5.255.133","port":"55336"},
    {"ip":"221.204.9.25","port":"55336"},{"ip":"111.161.35.56","port":"55336"},{"ip":"111.161.35.49","port":"55336"},
    {"ip":"183.129.134.226","port":"8080"},
    {"ip":"58.220.10.86","port":"80"},{"ip":"183.87.117.44","port":"80"},
    {"ip":"211.23.19.130","port":"80"},{"ip":"61.234.249.107","port":"8118"},{"ip":"200.20.168.140","port":"80"},
    {"ip":"111.1.46.176","port":"55336"},{"ip":"120.203.158.149","port":"8118"},{"ip":"70.39.189.6","port":"9090"},
    {"ip":"210.6.237.191","port":"3128"},{"ip":"122.155.195.26","port":"8080"}
];

/**
 * Pick one entry of PROXY_LIST at random.
 *
 * @returns {string} The proxy address formatted as 'http://<ip>:<port>'.
 */
module.exports.GetProxy = function () {
    // Math.floor already yields an integer index in [0, PROXY_LIST.length),
    // so the extra radix-less parseInt of the original is not needed.
    var randomNum = Math.floor(Math.random() * PROXY_LIST.length);
    var proxy = PROXY_LIST[randomNum];

    return 'http://' + proxy.ip + ':' + proxy.port;
};
對app.js代碼做如下修改
/*
 * Purpose: data collector sample (with a proxy and periodic execution)
 * Author:  Wilson
 * Date:    2015-07-29
 */
var request = require('request'),
    cheerio = require('cheerio'),
    URL_36KR = 'http://36kr.com/', // 36kr
    Proxy = require('./proxylist.js');

// ... (code elided in the article)

/*
 * Send a GET request to dataUrl through a proxy.
 * Proxy.GetProxy() is called on every request, so each call may use a
 * different proxy.
 */
function dataRequest(dataUrl) {
    request({
        url: dataUrl,
        proxy: Proxy.GetProxy(),
        method: 'GET'
    }, function(err, res, body) {
        // ... (callback body elided in the article)
    });
}

// ... (code elided in the article)

// Run once immediately, then again every 10000 ms.
dataCollectorStartup();
setInterval(dataCollectorStartup, 10000);
這樣就改造完成,加入了代理,並且加了setInterval進行定間隔執行!
請求https
上面示例中采集http請求,如果換成https呢?
新建app2.js,代碼如下
/*
 * Purpose: HTTPS request sample
 * Author:  Wilson
 * Date:    2015-07-29
 */
var request = require('request'),
    URL_INTERFACELIFE = 'https://interfacelift.com/wallpaper/downloads/date/wide_16:10/';

/* Start the data collector: request the HTTPS page. */
function dataCollectorStartup() {
    dataRequest(URL_INTERFACELIFE);
}

/*
 * Send a GET request to dataUrl and print the response body.
 *
 * dataUrl - the URL to fetch.
 *
 * On a request error the URL and the error are logged and the body is not printed.
 */
function dataRequest(dataUrl) {
    request({
        url: dataUrl,
        method: 'GET'
    }, function(err, res, body) {
        if (err) {
            console.log(dataUrl);
            console.error('[ERROR]Collection' + err);
            return;
        }

        console.info(body);
    });
}

dataCollectorStartup();
執行會發現返回body中什麽也沒有^_^!
加入一些代碼再看看
/*
 * Purpose: HTTPS request sample (with a custom User-Agent header)
 * Author:  Wilson
 * Date:    2015-07-29
 */
var request = require('request'),
    URL_INTERFACELIFE = 'https://interfacelift.com/wallpaper/downloads/date/wide_16:10/';

/* Start the data collector */
// ... (code elided in the article)

/*
 * Send a GET request to dataUrl with a User-Agent header and print the
 * response body.
 *
 * dataUrl - the URL to fetch.
 *
 * On a request error the URL and the error are logged and the body is not printed.
 */
function dataRequest(dataUrl) {
    request({
        url: dataUrl,
        method: 'GET',
        // The only change from the header-less version of this sample.
        headers: {
            'User-Agent': 'wilson'
        }
    }, function(err, res, body) {
        if (err) {
            console.log(dataUrl);
            console.error('[ERROR]Collection' + err);
            return;
        }

        console.info(body);
    });
}

// ... (code elided in the article)
再執行,你會發現body中返回請求的HTML!(結果就不放上來了,自己執行一下!)
詳細的請看:https://github.com/request/request#custom-http-headers
寫在之後
request庫我還是推薦API可以多看看,比如Forms部分我就在實際項目測試中用的比較多!
比如做接口測試:
1.提交兩個參數(參數1:字符串 參數2:數字)
// Submit two form fields to an API endpoint. The URL, the field names and the
// field values are placeholders; 參數二值 stands for a numeric value and must
// be replaced (or defined) before this snippet is run.
request.post({
    url: '接口URL',
    form: {
        參數一名稱: '參數一值',
        參數二名稱: 參數二值
    }
}, function(err, res, body) {
    if (err) {
        // Report the failure instead of returning silently.
        console.error(err);
        return;
    }

    console.log(body);
});
body就是接口返回
2.提交一個字符串參數,提交一個文件參數(比如上傳頭像等)
// fs is needed for createReadStream below; the original snippet used it
// without requiring it.
var fs = require('fs');

// Submit one string field and one file field (e.g. an avatar upload).
// The URL and the field names/values are placeholders.
var r = request.post('接口URL', function(err, res, body) {
    if (err) {
        return;
    }

    console.log(body);
});

// The fields are appended to the form object obtained from the pending request.
var form = r.form();
form.append('參數一名稱', '參數一值');
form.append('參數二名稱', fs.createReadStream('1.jpg'), {filename: '1.jpg'});
cheerio庫真沒什麽好講的,會jquery就行,它的api基本都不用看!
此系列的源代碼可到http://bijian1013.iteye.com/blog/2425085下載。
文章來源:https://www.cnblogs.com/zhongweiv/p/node_request_cheerio.html
Nodejs學習筆記(十一)—數據采集器示例(request和cheerio)