java網頁爬蟲正則表示式
阿新 • • 發佈:2018-11-09
package cn.itcast.regextest.demo;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal "web crawler" demo: reads text line by line from a web page or a
 * local file and collects every substring that matches a regular expression.
 */
public class PachongDemo {

    /** Literal keyword searched for in the downloaded news page. */
    private static final Pattern KEYWORD_PATTERN = Pattern.compile("南昌");

    /**
     * E-mail address pattern: one or more word characters, '@', a word, then
     * one or more ".word" domain parts (e.g. {@code user@mail.example.cn}).
     */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Runs the web-page demo and prints each match on its own line.
     *
     * @param args ignored
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        for (String match : demo_2()) {
            System.out.println(match);
        }
    }

    /**
     * Downloads a fixed news page and collects every occurrence of the keyword.
     *
     * @return all keyword occurrences, in the order they appear in the page
     * @throws IOException if the URL cannot be opened or read
     */
    public static List<String> demo_2() throws IOException {
        // URL: Uniform Resource Locator.
        URL url = new URL("https://news.163.com/18/0929/09/DSS2A0NO0001875N.html");
        // The stream is decoded with the platform default charset.
        // NOTE(review): the page's actual encoding is not known from this file;
        // pass an explicit charset here once it has been confirmed.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(url.openStream()))) {
            return collectMatches(reader, KEYWORD_PATTERN);
        }
    }

    /**
     * Crawls the local file {@code demo.txt} (resolved against the working
     * directory) and collects every e-mail address found in it.
     *
     * @return all e-mail addresses, in the order they appear in the file
     * @throws IOException if the file does not exist or cannot be read
     */
    public static List<String> demo_1() throws IOException {
        // try-with-resources closes the reader even if readLine() throws.
        try (BufferedReader reader = new BufferedReader(new FileReader("demo.txt"))) {
            return collectMatches(reader, EMAIL_PATTERN);
        }
    }

    /**
     * Reads {@code reader} to the end and returns every match of {@code pattern}.
     * The caller keeps ownership of the reader and is responsible for closing it.
     */
    private static List<String> collectMatches(BufferedReader reader, Pattern pattern)
            throws IOException {
        List<String> matches = new ArrayList<String>();
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = pattern.matcher(line);
            // Loop on find() so that several matches on one line are all kept.
            while (matcher.find()) {
                matches.add(matcher.group());
            }
        }
        return matches;
    }
}