利用java-maven程式爬取西刺網頁的ip代理
阿新 • • 發佈:2018-12-12
主要程式碼:
package com.itquwei.spider; import java.io.IOException; import java.nio.charset.Charset; import org.apache.http.HttpEntity; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.itquwei.spider.dao.IpInfoDao; import com.itquwei.spider.pojo.IpInfo; public class XCSpider { private static IpInfoDao dao = new IpInfoDao(); public static void main(String[] args) throws Exception { for (int page = 1; page < 664; page++) { paging(page); System.out.println("第一"+page+"頁"); Thread.sleep(5000); } } // 分頁查詢 public static void paging(int page) throws IOException, ClientProtocolException { // 建立一個客戶端 String url = "http://www.xicidaili.com/nt/" + page; String html = getIndex(url); IpInfo ipInfo = getIpInfo(html); if (ipInfo != null) { dao.saveIpInfo(ipInfo); } } // 獲取ip詳細資訊 public static IpInfo getIpInfo(String html) { Document doc = Jsoup.parse(html); Elements trs = doc.select("#ip_list tr[class]"); for (Element element : trs) { IpInfo info = new IpInfo(); Elements tds = element.select("tr td"); // 獲取ip地址 String ip = tds.get(1).text(); info.setIp(ip); // 獲取埠號 String port = tds.get(2).text(); info.setPort(port); // 獲取伺服器地址 String address = tds.get(3).select("a").text(); info.setAddress(address); // 獲取狀態 String status = tds.get(4).text(); info.setStatus(status); // 獲取型別 String type = tds.get(5).text(); info.setType(type); // 獲取存活時間 String liveTime = tds.get(8).text(); info.setLiveTime(liveTime); // 獲取驗證時間 String testTime = tds.get(9).text(); info.setTestTime(testTime); // System.out.println(info); return info; } return null; } // 獲取西刺網頁 public static 
String getIndex(String url) throws IOException, ClientProtocolException { //建立客戶端 CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(url); // setConnectTimeout(10000)連線超時時間(單位豪秒) // setSocketTimeout(10000)讀取超時時間(單位豪秒) RequestConfig config = RequestConfig.custom().setConnectTimeout(20000) .setSocketTimeout(30000).build(); httpGet.setConfig(config); httpGet.setHeader( "User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"); // 執行 CloseableHttpResponse res = httpClient.execute(httpGet); HttpEntity entity = res.getEntity(); String html = ""; if (entity != null) { html = EntityUtils.toString(entity, Charset.forName("utf-8")); } // System.out.println(html); return html; } }
pojo程式碼:
package com.itquwei.spider.pojo; public class IpInfo { private String ip; private String port; private String address; private String status; private String type; private String liveTime; private String testTime; @Override public String toString() { return "IpInfo [ip=" + ip + ", port=" + port + ", address=" + address + ", status=" + status + ", type=" + type + ", liveTime=" + liveTime + ", testTime=" + testTime + "]"; } public String getStatus() { return status; } public void setStatus(String status) { this.status = status; } public String getLiveTime() { return liveTime; } public void setLiveTime(String liveTime) { this.liveTime = liveTime; } public String getTestTime() { return testTime; } public void setTestTime(String testTime) { this.testTime = testTime; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } public String getPort() { return port; } public void setPort(String port) { this.port = port; } public String getAddress() { return address; } public void setAddress(String address) { this.address = address; } public String getType() { return type; } public void setType(String type) { this.type = type; } }
dao程式碼:連線資料庫用的
package com.itquwei.spider.dao;

import org.springframework.jdbc.core.JdbcTemplate;

import com.itquwei.spider.pojo.IpInfo;
import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * {@link JdbcTemplate} backed by a c3p0 pooled connection to the local {@code spider}
 * MySQL database; stores crawled proxy entries in the {@code xc_ipInfo} table.
 */
public class IpInfoDao extends JdbcTemplate {

    /**
     * JDBC URL of the target database. The parameter name was previously misspelled
     * ("characterEnconding"), so the intended UTF-8 connection encoding was not applied.
     */
    private static final String JDBC_URL =
            "jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8";

    /** Parameterized insert covering all seven {@link IpInfo} fields. */
    private static final String INSERT_SQL =
            "insert into xc_ipInfo (ip,port,address,status,type,liveTime,testTime) values(?,?,?,?,?,?,?);";

    /** Creates the pooled data source and installs it on this template. */
    public IpInfoDao() {
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        dataSource.setJdbcUrl(JDBC_URL);
        // NOTE(review): credentials are hard-coded; consider loading them from configuration.
        dataSource.setUser("root");
        dataSource.setPassword("root");
        setDataSource(dataSource);
    }

    /**
     * Inserts one proxy entry as a new row.
     *
     * @param info the entry to store; its seven fields are bound to the statement in
     *             column order
     */
    public void saveIpInfo(IpInfo info) {
        update(INSERT_SQL,
                info.getIp(),
                info.getPort(),
                info.getAddress(),
                info.getStatus(),
                info.getType(),
                info.getLiveTime(),
                info.getTestTime());
    }
}
結果: