1. 程式人生 > >網易雲音樂爬蟲--評論爬取以及Top Music統計

網易雲音樂爬蟲--評論爬取以及Top Music統計

        網易雲音樂的評論十分有趣,於是就想寫個爬蟲爬取評論。但是不熟悉Python,就用Java寫了個。

        主要使用了HttpClient、Jsoup、佇列、執行緒、log4j,並用poi生成Excel儲存結果。書寫過程中主要的一個問題就是評論獲取:網易對其進行了加密,進行好一番搜尋才找到解決方法。爬取歌單數、Top歌曲數都可以動態進行配置。

        目錄結構

       

主程式

package personal.mario.main;

import java.io.IOException;
import java.util.List;
import org.apache.http.client.ClientProtocolException;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import personal.mario.bean.MusicCommentMessage;
import personal.mario.service.HtmlFetcherService;
import personal.mario.service.HtmlParserService;
import personal.mario.service.MusicListQueueService;
import personal.mario.service.MusicQueueService;
import personal.mario.service.TopMusicCalculateService;
import personal.mario.utils.Constants;
import personal.mario.utils.GenerateExcelUtils;

/*
 * 主邏輯
 * author timeless.li
 * 2016-10-26
 * */
/**
 * Main crawl loop.
 *
 * <p>Collects playlist URLs, then for every playlist queues its songs, fetches each
 * not-yet-crawled song's comment data, feeds it to the Top-music calculation and writes
 * it to the Excel outputs produced by {@code GenerateExcelUtils}.
 */
public class NetEaseCrawler implements Runnable {
	
	private int totalMusicList = Constants.MUSIC_LIST_COUNT;
	private int limit = Constants.PER_PAGE;
	private int offset = Constants.OFFSET;
	private HSSFWorkbook commentMessageWorkbook = new HSSFWorkbook();
	private List<MusicCommentMessage> ms = null;
	private static final Logger logger = Logger.getLogger(NetEaseCrawler.class);
	
	/**
	 * Runs the whole crawl. Any exception aborts the crawl and is logged; nothing is
	 * rethrown to the caller.
	 */
	@Override
	public void run() {
		try {
			// Seed the queue of playlist URLs still to be crawled.
			initUncrawledMusicListQueue();
			
			// Number of songs processed; used as the row counter for the song-info sheet.
			int count = 0;
			
			// Prepare the song-info sheet once; rows are appended as songs are crawled.
			HSSFSheet commentMessageSheet = GenerateExcelUtils.generateCommentMessageExcelInit(commentMessageWorkbook);
			
			crawl:
			while (!MusicListQueueService.isUncrawledMusicListEmpty()) {
				
				// Queue the songs of the next playlist.
				fillUncrawledMusicQueue(MusicListQueueService.getTopMusicList());
				
				// Drain the song queue, then go back for the next playlist.
				while (!MusicQueueService.isUncrawledMusicQueueEmpty()) {
					
					String songId = MusicQueueService.getTopMusicUrl();
					
					// Skip songs already handled via another playlist.
					if (MusicQueueService.isMusicCrawled(songId)) {
						continue;
					}
					
					MusicCommentMessage mcm = getCommentMessage(songId);
					
					// null means the thread was interrupted while waiting to retry:
					// stop crawling but still write out what has been collected.
					if (mcm == null) {
						logger.info("crawl interrupted, stopping at song " + songId);
						break crawl;
					}
					
					// Update the running Top list with this song.
					ms = TopMusicCalculateService.getTopMusic(mcm);
					
					// Append this song to the song-info sheet.
					GenerateExcelUtils.generateCommentMessageExcelProcess(commentMessageWorkbook, commentMessageSheet, mcm, count);
					
					// Write this song's comments to their own workbook.
					GenerateExcelUtils.generateCommentsExcel(mcm);
					
					// Remember the song so later playlists do not crawl it again.
					MusicQueueService.addCrawledMusic(songId);
					count++;
				}
			}
			
			// Persist the song-info workbook.
			GenerateExcelUtils.generateCommentMessageExcelWrite(commentMessageWorkbook);
			
			// Persist the Top list.
			// NOTE(review): ms is still null here when no song was crawled — confirm
			// generateTopMusicExcel tolerates that.
			GenerateExcelUtils.generateTopMusicExcel(ms);
			
			logger.info("count : " + count);
			
			// Size of the crawled-song record kept by MusicQueueService.
			logger.info("size : " + MusicQueueService.getCrawledMusicSize());
		} catch (Exception e) {
			// Route the failure through the logger instead of stderr so it lands in the
			// same place as the rest of the crawl output.
			logger.error("crawl aborted", e);
		}
	}
	
	/**
	 * Requests the playlist index and stores the playlist URLs it contains.
	 *
	 * <p>When {@code totalMusicList} is larger than {@code limit} the index is requested
	 * page by page; the page size is reduced for the last request so that
	 * {@code offset + limit} does not run past {@code totalMusicList}. Otherwise a single
	 * request with {@code limit = totalMusicList} is made.
	 *
	 * @throws ClientProtocolException propagated from the fetch
	 * @throws IOException propagated from the fetch or parse
	 */
	public void initUncrawledMusicListQueue() throws ClientProtocolException, IOException {
		
		if (totalMusicList > limit) { 
			int tmpLimit = limit;
			int tmpOffset = offset;
			
			while (totalMusicList > tmpOffset) {
				
				// Build the query for the current page before advancing the offset.
				String suffix = "limit=" + tmpLimit + "&offset=" + tmpOffset;
				tmpOffset += tmpLimit;
				
				// Shrink the NEXT page if a full one would overshoot the total.
				if (tmpOffset + tmpLimit > totalMusicList) {
					tmpLimit =  totalMusicList - tmpOffset;
				}
				
				HtmlParserService.parseAndSaveMusicListUrl(HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix));
			}
		} else {
			String suffix = "limit=" + totalMusicList + "&offset=" + offset;
			HtmlParserService.parseAndSaveMusicListUrl(HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix));
		}
	}
	
	/**
	 * Parses one playlist page so that its songs get queued for crawling.
	 *
	 * @param musicListUrl URL of the playlist to parse
	 * @throws IOException propagated from the parser
	 */
	public void fillUncrawledMusicQueue(String musicListUrl) throws IOException {
		HtmlParserService.parseMusicListAndGetMusics(musicListUrl);
	}
	
	/**
	 * Fetches the comment data of one song, retrying until it succeeds.
	 *
	 * <p>A {@code null} parse result or an exception is treated as the server refusing
	 * the request; the method then sleeps a random time of up to 30 seconds and tries
	 * again. The retry is a loop rather than recursion so that a long block by the
	 * server cannot exhaust the call stack, and both failure paths back off before
	 * retrying.
	 *
	 * @param songId id of the song to fetch
	 * @return the parsed comment data, or {@code null} only if the thread was interrupted
	 *         while waiting to retry (the interrupt flag is restored in that case)
	 */
	public MusicCommentMessage getCommentMessage(String songId) {
		while (true) {
			try {
				MusicCommentMessage mc = HtmlParserService.parseCommentMessage(songId);
				
				if (mc != null) {
					return mc;
				}
				
				logger.info("warining: be interceptted by net ease music server..");
			} catch (Exception e) {
				// Keep the cause in the log; the original discarded it.
				logger.info("error: be refused by net ease music server..", e);
			}
			
			try {
				Thread.sleep((long) (Math.random() * 30000));
			} catch (InterruptedException e) {
				// Honour the interruption instead of swallowing it and retrying forever.
				Thread.currentThread().interrupt();
				return null;
			}
		}
	}
}

計算Top歌曲
package personal.mario.service;

import java.util.ArrayList;
import java.util.List;
import personal.mario.bean.MusicCommentMessage;
import personal.mario.utils.Constants;

/*計算獲取TOP 歌曲*/
/**
 * Keeps a running list of the most-commented songs seen so far.
 *
 * <p>The list is ordered by comment count, highest first. Below
 * {@code Constants.TOP_MUSIC_COUNT} entries every offered song is kept; once that many
 * entries exist, a song is only inserted if it beats an existing entry, and the last
 * entry is dropped to make room.
 */
public class TopMusicCalculateService {
	private static List<MusicCommentMessage> ms = new ArrayList<MusicCommentMessage>();
	
	/**
	 * Offers a song to the Top list.
	 *
	 * @param mcm the song to consider
	 * @return the shared Top list itself (not a copy), after the update
	 */
	public static List<MusicCommentMessage> getTopMusic(MusicCommentMessage mcm) {
		final int sizeBefore = ms.size();
		
		// The very first song is always taken.
		if (sizeBefore == 0) {
			ms.add(mcm);
			return ms;
		}
		
		// Locate the first entry that the new song strictly beats.
		int insertAt = -1;
		for (int idx = 0; idx < sizeBefore; idx++) {
			if (mcm.getCommentCount() > ms.get(idx).getCommentCount()) {
				insertAt = idx;
				break;
			}
		}
		
		final boolean listFull = sizeBefore >= Constants.TOP_MUSIC_COUNT;
		
		if (insertAt >= 0) {
			ms.add(insertAt, mcm);
			if (listFull) {
				// After the insert the old last entry sits at index sizeBefore; evict it.
				ms.remove(sizeBefore);
			}
		} else if (!listFull) {
			// Beats nobody, but there is still room: goes to the end.
			ms.add(mcm);
		}
		
		return ms;
	}
}

生成評論Excel表
//歌曲評論Excel生成
	/**
	 * Writes the comments of one song to a new HSSF workbook on disk.
	 *
	 * <p>The sheet has a styled header row followed by one row per comment. The file
	 * path is {@code Constants.COMMENTS_PATH} + the song title passed through
	 * {@code StringUtils.dealWithFilename} + {@code Constants.COMMENTS_SUFFIX}.
	 *
	 * @param musicCommentMessage the song whose title and comments are written
	 * @throws IOException if the file cannot be opened or written
	 */
	public static void generateCommentsExcel(MusicCommentMessage musicCommentMessage) throws IOException {
		
		HSSFWorkbook workbook = new HSSFWorkbook();
		HSSFSheet sheet = workbook.createSheet("歌曲評論");
		sheet.setDefaultColumnWidth(15);
		
		// Header style: centred, small bold light-blue font.
		HSSFCellStyle style = workbook.createCellStyle();
		style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
		
		HSSFFont font = workbook.createFont();
		font.setColor(HSSFColor.LIGHT_BLUE.index);
		font.setFontHeightInPoints((short) 8);
		font.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD);
		style.setFont(font);
		
		// Column titles, in column order.
		String[] headers = {"歌名", "評論型別", "評論使用者暱稱", "評論時間", "評論內容", "獲贊數"};
		
		HSSFRow rowHead = sheet.createRow(0);
		for (int col = 0; col < headers.length; col++) {
			HSSFCell cellHead = rowHead.createCell(col);
			cellHead.setCellValue(headers[col]);
			cellHead.setCellStyle(style);
		}
		
		// Data cells are centred but otherwise unstyled.
		HSSFCellStyle cellStyle = workbook.createCellStyle();
		cellStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER);
		
		List<MusicComment> comments = musicCommentMessage.getComments();
		
		for (int i = 0; i < comments.size(); i++) {
			MusicComment comment = comments.get(i);
			
			// Row 0 is the header, so comment i goes to row i + 1.
			HSSFRow row = sheet.createRow(i + 1);
			
			HSSFCell cell = row.createCell(0);
			cell.setCellValue(musicCommentMessage.getSongTitle());
			cell.setCellStyle(cellStyle);
			
			cell = row.createCell(1);
			cell.setCellValue(comment.getType());
			cell.setCellStyle(cellStyle);
			
			cell = row.createCell(2);
			cell.setCellValue(comment.getNickname());
			cell.setCellStyle(cellStyle);
			
			cell = row.createCell(3);
			cell.setCellValue(comment.getCommentDate());
			cell.setCellStyle(cellStyle);
			
			cell = row.createCell(4);
			cell.setCellValue(comment.getContent());
			cell.setCellStyle(cellStyle);
			
			cell = row.createCell(5);
			cell.setCellValue(comment.getAppreciation());
			cell.setCellStyle(cellStyle);
		}
		
		String path = Constants.COMMENTS_PATH + StringUtils.dealWithFilename(musicCommentMessage.getSongTitle()) + Constants.COMMENTS_SUFFIX;
		logger.info(path);
		
		FileOutputStream fos = new FileOutputStream(path);
		try {
			workbook.write(fos);
		} finally {
			// Close even when write() fails so the file handle is not leaked.
			fos.close();
		}
	}

歌曲佇列
package personal.mario.service;

import java.util.Collections;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;

/*歌曲佇列*/
/**
 * Process-wide bookkeeping for songs: a FIFO queue of song ids still to be crawled and
 * a record of the ids that have already been crawled.
 */
public class MusicQueueService {
	private static Queue<String> uncrawledMusics = new ConcurrentLinkedQueue<String>();
	
	// A hash set rather than a queue: this collection is only ever asked
	// "is this id present?" and "how many ids?", which a linked queue answers by
	// walking every element.
	private static Set<String> crawledMusics =
			Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
	
	/**
	 * Appends a song id to the to-crawl queue.
	 *
	 * @param e the song id to queue
	 */
	public static void addUncrawledMusic(String e) {
		uncrawledMusics.offer(e);
	}
	
	/**
	 * Removes and returns the next song id to crawl.
	 *
	 * @return the head of the to-crawl queue, or {@code null} if the queue is empty
	 */
	public static String getTopMusicUrl() {
		// poll() already yields null on an empty queue; a separate isEmpty() check
		// only adds a check-then-act gap.
		return uncrawledMusics.poll();
	}
	
	/**
	 * Records a song id as crawled. Recording the same id again has no effect.
	 *
	 * @param e the song id that has been crawled
	 */
	public static void addCrawledMusic(String e) {
		crawledMusics.add(e);
	}
	
	/**
	 * Tells whether a song id has been recorded as crawled.
	 *
	 * @param id the song id to look up; {@code null} is never considered crawled
	 * @return {@code true} if the id was recorded via {@link #addCrawledMusic(String)}
	 */
	public static boolean isMusicCrawled(String id) {
		// The backing concurrent map rejects null keys, so answer that case here.
		return id != null && crawledMusics.contains(id);
	}
	
	/**
	 * @return {@code true} if no song id is waiting to be crawled
	 */
	public static boolean isUncrawledMusicQueueEmpty() {
		return uncrawledMusics.isEmpty();
	}
	
	/**
	 * Prints every queued song id to standard output, one per line.
	 *
	 * <p>Note that this removes the ids from the queue as it prints them; the queue is
	 * empty afterwards.
	 */
	public static void printAll() {
		String next;
		while ((next = uncrawledMusics.poll()) != null) {
			System.out.println(next);
		}
	}
	
	/**
	 * @return the number of distinct song ids recorded as crawled
	 */
	public static int getCrawledMusicSize() {
		return crawledMusics.size();
	}
}
爬取結果