網易雲音樂爬蟲--評論爬取以及Top Music統計
阿新 • • 發佈:2019-02-20
網易雲音樂的評論十分有趣,於是就想寫個爬蟲爬取評論。但是不熟悉Python,就用Java寫了個。
主要使用了HttpClient, Jsoup, 佇列, 執行緒, log4j, 以及用poi生成Excel儲存結果。書寫過程中主要的一個問題就是評論獲取:網易對其進行了加密,進行好一番搜尋才找到解決方法。爬取歌單數、Top歌曲數都可以動態進行配置。
目錄結構
主程式
package personal.mario.main; import java.io.IOException; import java.util.List; import org.apache.http.client.ClientProtocolException; import org.apache.log4j.Logger; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import personal.mario.bean.MusicCommentMessage; import personal.mario.service.HtmlFetcherService; import personal.mario.service.HtmlParserService; import personal.mario.service.MusicListQueueService; import personal.mario.service.MusicQueueService; import personal.mario.service.TopMusicCalculateService; import personal.mario.utils.Constants; import personal.mario.utils.GenerateExcelUtils; /* * 主邏輯 * author timeless.li * 2016-10-26 * */ public class NetEaseCrawler implements Runnable { private int totalMusicList = Constants.MUSIC_LIST_COUNT; private int limit = Constants.PER_PAGE; private int offset =Constants.OFFSET; private HSSFWorkbook commentMessageWorkbook = new HSSFWorkbook(); private List<MusicCommentMessage> ms = null; private static Logger logger = Logger.getLogger(NetEaseCrawler.class); @Override public void run() { try { //初始化待爬取的歌單URL佇列 initUncrawledMusicListQueue(); //記錄所有爬取出來的歌曲數,包含重複歌曲 int count = 0; //歌曲資訊Excel初始化 HSSFSheet commentMessageSheet = GenerateExcelUtils.generateCommentMessageExcelInit(commentMessageWorkbook); //開始根據歌單爬取 while (!MusicListQueueService.isUncrawledMusicListEmpty()) { //填充待爬取歌曲佇列 fillUncrawledMusicQueue(MusicListQueueService.getTopMusicList()); //歌曲佇列為空就返回上層迴圈填充歌曲佇列 while (!MusicQueueService.isUncrawledMusicQueueEmpty()) { //取出待爬取歌曲ID String songId = MusicQueueService.getTopMusicUrl(); //判斷是否已經爬取過 if (!MusicQueueService.isMusicCrawled(songId)) { //獲取到爬取結果,歌曲資訊 MusicCommentMessage mcm = getCommentMessage(songId); //判斷是否加入Top歌曲列表 ms = TopMusicCalculateService.getTopMusic(mcm); //向歌曲資訊Excel插入資料 GenerateExcelUtils.generateCommentMessageExcelProcess(commentMessageWorkbook, commentMessageSheet, mcm, count); //生成歌曲評論Excel GenerateExcelUtils.generateCommentsExcel(mcm); 
//加入已經爬取的佇列,供以後查重判斷 MusicQueueService.addCrawledMusic(songId); count++; } } } //生成歌曲資訊Excel GenerateExcelUtils.generateCommentMessageExcelWrite(commentMessageWorkbook); //生成Top歌曲Excel GenerateExcelUtils.generateTopMusicExcel(ms); logger.info("count : " + count); //實際爬取的歌曲數,不包含重複 logger.info("size : " + MusicQueueService.getCrawledMusicSize()); } catch (Exception e) { e.printStackTrace(); } } /* * 迴圈請求獲取所有歌單 * */ public void initUncrawledMusicListQueue() throws ClientProtocolException, IOException { if (totalMusicList > limit) { int tmpLimit = limit; int tmpOffset = offset; while (totalMusicList > tmpOffset) { String suffix = "limit=" + tmpLimit + "&offset=" + tmpOffset; tmpOffset += tmpLimit; if (tmpOffset + tmpLimit > totalMusicList) { tmpLimit = totalMusicList - tmpOffset; } HtmlParserService.parseAndSaveMusicListUrl(HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix)); } } else { String suffix = "limit=" + totalMusicList + "&offset=" + offset; HtmlParserService.parseAndSaveMusicListUrl(HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix)); } } //填充要爬取的歌曲佇列 public void fillUncrawledMusicQueue(String musicListUrl) throws IOException { HtmlParserService.parseMusicListAndGetMusics(musicListUrl); } //由於反爬的存在, 一旦被禁止爬取, 休眠幾秒後再進行爬取 public MusicCommentMessage getCommentMessage(String songId) { try { MusicCommentMessage mc = HtmlParserService.parseCommentMessage(songId); if (mc == null) { logger.info("warining: be interceptted by net ease music server.."); Thread.sleep((long) (Math.random() * 30000)); //遞迴 return getCommentMessage(songId); } else { return mc; } } catch (Exception e) { logger.info("error: be refused by net ease music server.."); return getCommentMessage(songId); } } }
計算Top歌曲
package personal.mario.service;

import java.util.ArrayList;
import java.util.List;
import personal.mario.bean.MusicCommentMessage;
import personal.mario.utils.Constants;

/* Maintains the running list of top songs, ordered by comment count (highest first). */
public class TopMusicCalculateService {
    private static List<MusicCommentMessage> ms = new ArrayList<MusicCommentMessage>();

    /*
     * Offers one song to the top list and returns the (shared) list.
     *
     * The song is placed in front of the first entry it strictly beats. While the list
     * holds fewer than Constants.TOP_MUSIC_COUNT entries a song that beats nobody is
     * appended; once the list is full such a song is dropped, and an inserted song
     * pushes the last entry out.
     */
    public static List<MusicCommentMessage> getTopMusic(MusicCommentMessage mcm) {
        int sizeBefore = ms.size();

        // Locate the first entry with a strictly smaller comment count.
        int slot = 0;
        while (slot < sizeBefore
                && !(mcm.getCommentCount() > ms.get(slot).getCommentCount())) {
            slot++;
        }

        if (sizeBefore == 0 || sizeBefore < Constants.TOP_MUSIC_COUNT) {
            // Still room: insert at the slot (slot == sizeBefore means append).
            ms.add(slot, mcm);
        } else if (slot < sizeBefore) {
            // List is full: insert, then evict the entry that is now last.
            ms.add(slot, mcm);
            ms.remove(sizeBefore);
        }
        return ms;
    }
}
生成評論Excel表
//歌曲評論Excel生成 public static void generateCommentsExcel(MusicCommentMessage musicCommentMessage) throws IOException { HSSFWorkbook workbook = new HSSFWorkbook(); HSSFSheet sheet = workbook.createSheet("歌曲評論"); sheet.setDefaultColumnWidth(15); HSSFRow rowHead = sheet.createRow(0); HSSFCellStyle style = workbook.createCellStyle(); style.setAlignment(HSSFCellStyle.ALIGN_CENTER); HSSFFont font = workbook.createFont(); font.setColor(HSSFColor.LIGHT_BLUE.index); font.setFontHeightInPoints((short) 8); font.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD); style.setFont(font); HSSFCell cellHead = rowHead.createCell(0); cellHead.setCellValue("歌名"); cellHead.setCellStyle(style); cellHead = rowHead.createCell(1); cellHead.setCellValue("評論型別"); cellHead.setCellStyle(style); cellHead = rowHead.createCell(2); cellHead.setCellValue("評論使用者暱稱"); cellHead.setCellStyle(style); cellHead = rowHead.createCell(3); cellHead.setCellValue("評論時間"); cellHead.setCellStyle(style); cellHead = rowHead.createCell(4); cellHead.setCellValue("評論內容"); cellHead.setCellStyle(style); cellHead = rowHead.createCell(5); cellHead.setCellValue("獲贊數"); cellHead.setCellStyle(style); HSSFCellStyle cellStyle = workbook.createCellStyle(); cellStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER); List<MusicComment> comments = musicCommentMessage.getComments(); for (int i = 0; i < comments.size(); i++) { MusicComment comment = comments.get(i); HSSFRow row = sheet.createRow(i + 1); HSSFCell cell = row.createCell(0); cell.setCellValue(musicCommentMessage.getSongTitle()); cell.setCellStyle(cellStyle); cell = row.createCell(1); cell.setCellValue(comment.getType()); cell.setCellStyle(cellStyle); cell = row.createCell(2); cell.setCellValue(comment.getNickname()); cell.setCellStyle(cellStyle); cell = row.createCell(3); cell.setCellValue(comment.getCommentDate()); cell.setCellStyle(cellStyle); cell = row.createCell(4); cell.setCellValue(comment.getContent()); cell.setCellStyle(cellStyle); cell = row.createCell(5); 
cell.setCellValue(comment.getAppreciation()); cell.setCellStyle(cellStyle); } String path = Constants.COMMENTS_PATH + StringUtils.dealWithFilename(musicCommentMessage.getSongTitle()) + Constants.COMMENTS_SUFFIX; logger.info(path); FileOutputStream fos = new FileOutputStream(path); workbook.write(fos); fos.close(); }
歌曲佇列
package personal.mario.service;
import java.util.Collections;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
/*
 * Song queues: a FIFO queue of song ids waiting to be crawled, plus the collection of
 * ids that have already been crawled (used for duplicate detection).
 */
public class MusicQueueService {
    /* Song ids waiting to be crawled, in insertion order. */
    private static final Queue<String> uncrawledMusics = new ConcurrentLinkedQueue<String>();

    /*
     * Song ids already crawled. Only membership and size are ever queried, so a
     * concurrent set is used instead of a queue: look-ups no longer scan every element
     * and each id is counted once.
     */
    private static final Set<String> crawledMusics =
            Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

    /* Appends a song id to the pending queue. */
    public static void addUncrawledMusic(String e) {
        uncrawledMusics.offer(e);
    }

    /* Removes and returns the next pending song id, or null when the queue is empty. */
    public static String getTopMusicUrl() {
        // poll() already returns null on an empty queue; no separate isEmpty() check needed.
        return uncrawledMusics.poll();
    }

    /* Records a song id as crawled. */
    public static void addCrawledMusic(String e) {
        crawledMusics.add(e);
    }

    /* Returns true if the given song id has been recorded as crawled; false for null. */
    public static boolean isMusicCrawled(String id) {
        // The backing ConcurrentHashMap rejects null keys, so answer null up front.
        return id != null && crawledMusics.contains(id);
    }

    /* Returns true when no song id is waiting to be crawled. */
    public static boolean isUncrawledMusicQueueEmpty() {
        return uncrawledMusics.isEmpty();
    }

    /* Prints every pending song id to standard output without removing it. */
    public static void printAll() {
        for (String pending : uncrawledMusics) {
            System.out.println(pending);
        }
    }

    /* Returns the number of distinct song ids recorded as crawled. */
    public static int getCrawledMusicSize() {
        return crawledMusics.size();
    }
}
爬取結果