网易云云音乐评论十分有趣,于是就想写个爬虫爬取评论。但是不熟悉Python,就用java写了个。
主要使用了HttpClient,,Jsoup, 队列, 线程, log4j,poi生成Excel保存结果, 书写过程中主要一个问题就是评论获取,网易对其进行了加密,进行好一番搜索才找到解决方法。爬取歌单数,top歌曲数都可以动态进行配置.
目录结构
主程序
package personal.mario.main;
import java.io.IOException;
import java.util.List;
import org.apache.http.client.ClientProtocolException;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import personal.mario.bean.MusicCommentMessage;
import personal.mario.service.HtmlFetcherService;
import personal.mario.service.HtmlParserService;
import personal.mario.service.MusicListQueueService;
import personal.mario.service.MusicQueueService;
import personal.mario.service.TopMusicCalculateService;
import personal.mario.utils.Constants;
import personal.mario.utils.GenerateExcelUtils;
/*
* 主逻辑
* author timeless.li
* 2016-10-26
* */
public class NetEaseCrawler implements Runnable {
private int totalMusicList = Constants.MUSIC_LIST_COUNT;
private int limit = Constants.PER_PAGE;
private int offset =Constants.OFFSET;
private HSSFWorkbook commentMessageWorkbook = new HSSFWorkbook();
private List<MusicCommentMessage> ms = null;
private static Logger logger = Logger.getLogger(NetEaseCrawler.class);
@Override
public void run() {
try {
//初始化待爬取的歌单URL队列
initUncrawledMusicListQueue();
//记录所有爬取出来的歌曲数,包含重复歌曲
int count = 0;
//歌曲信息Excel初始化
HSSFSheet commentMessageSheet = GenerateExcelUtils.generateCommentMessageExcelInit(commentMessageWorkbook);
//开始根据歌单爬取
while (!MusicListQueueService.isUncrawledMusicListEmpty()) {
//填充待爬取歌曲队列
fillUncrawledMusicQueue(MusicListQueueService.getTopMusicList());
//歌曲队列为空就返回上层循环填充歌曲队列
while (!MusicQueueService.isUncrawledMusicQueueEmpty()) {
//取出待爬取歌曲ID
String songId = MusicQueueService.getTopMusicUrl();
//判断是否已经爬取过
if (!MusicQueueService.isMusicCrawled(songId)) {
//获取到爬取结果,歌曲信息
MusicCommentMessage mcm = getCommentMessage(songId);
//判断是否加入Top歌曲列表
ms = TopMusicCalculateService.getTopMusic(mcm);
//向歌曲信息Excel插入数据
GenerateExcelUtils.generateCommentMessageExcelProcess(commentMessageWorkbook, commentMessageSheet, mcm, count);
//生成歌曲评论Excel
GenerateExcelUtils.generateCommentsExcel(mcm);
//加入已经爬取的队列,供以后查重判断
MusicQueueService.addCrawledMusic(songId);
count++;
}
}
}
//生成歌曲信息Excel
GenerateExcelUtils.generateCommentMessageExcelWrite(commentMessageWorkbook);
//生成Top歌曲Excel
GenerateExcelUtils.generateTopMusicExcel(ms);
logger.info("count : " + count);
//实际爬取的歌曲数,不包含重复
logger.info("size : " + MusicQueueService.getCrawledMusicSize());
} catch (Exception e) {
e.printStackTrace();
}
}
/*
* 循环请求获取所有歌单
* */
public void initUncrawledMusicListQueue() throws ClientProtocolException, IOException {
if (totalMusicList > limit) {
int tmpLimit = limit;
int tmpOffset = offset;
while (totalMusicList > tmpOffset) {
String suffix = "limit=" + tmpLimit + "&offset=" + tmpOffset;
tmpOffset += tmpLimit;
if (tmpOffset + tmpLimit > totalMusicList) {
tmpLimit = totalMusicList - tmpOffset;
}
HtmlParserService.parseAndSaveMusicListUrl(HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix));
}
} else {
String suffix = "limit=" + totalMusicList + "&offset=" + offset;
HtmlParserService.parseAndSaveMusicListUrl(HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix));
}
}
//填充要爬取的歌曲队列
public void fillUncrawledMusicQueue(String musicListUrl) throws IOException {
HtmlParserService.parseMusicListAndGetMusics(musicListUrl);
}
//由于反爬的存在, 一旦被禁止爬取, 休眠几秒后再进行爬取
public MusicCommentMessage getCommentMessage(String songId) {
try {
MusicCommentMessage mc = HtmlParserService.parseCommentMessage(songId);
if (mc == null) {
logger.info("warining: be interceptted by net ease music server..");
Thread.sleep((long) (Math.random() * 30000));
//递归
return getCommentMessage(songId);
} else {
return mc;
}
} catch (Exception e) {
logger.info("error: be refused by net ease music server..");
return getCommentMessage(songId);
}
}
}
计算Top歌曲
package personal.mario.service;
import java.util.ArrayList;
import java.util.List;
import personal.mario.bean.MusicCommentMessage;
import personal.mario.utils.Constants;
/*计算获取TOP 歌曲*/
public class TopMusicCalculateService {
private static List<MusicCommentMessage> ms = new ArrayList<MusicCommentMessage>();
public static List<MusicCommentMessage> getTopMusic(MusicCommentMessage mcm) {
int topSize = ms.size();
if (topSize == 0) {
ms.add(mcm);
}
if (topSize > 0 && topSize < Constants.TOP_MUSIC_COUNT) {
for (int j = 0; j < topSize; j++) {
if (mcm.getCommentCount() > ms.get(j).getCommentCount()) {
ms.add(j, mcm);
break;
}
if (j == topSize - 1) {
ms.add(mcm);
}
}
}
if (topSize >= Constants.TOP_MUSIC_COUNT) {
for (int j = 0; j < topSize; j++) {
if (mcm.getCommentCount() > ms.get(j).getCommentCount()) {
ms.add(j, mcm);
ms.remove(topSize);
break;
}
}
}
return ms;
}
}
生成评论Excel表
//歌曲评论Excel生成
public static void generateCommentsExcel(MusicCommentMessage musicCommentMessage) throws IOException {
HSSFWorkbook workbook = new HSSFWorkbook();
HSSFSheet sheet = workbook.createSheet("歌曲评论");
sheet.setDefaultColumnWidth(15);
HSSFRow rowHead = sheet.createRow(0);
HSSFCellStyle style = workbook.createCellStyle();
style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
HSSFFont font = workbook.createFont();
font.setColor(HSSFColor.LIGHT_BLUE.index);
font.setFontHeightInPoints((short) 8);
font.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD);
style.setFont(font);
HSSFCell cellHead = rowHead.createCell(0);
cellHead.setCellValue("歌名");
cellHead.setCellStyle(style);
cellHead = rowHead.createCell(1);
cellHead.setCellValue("评论类型");
cellHead.setCellStyle(style);
cellHead = rowHead.createCell(2);
cellHead.setCellValue("评论用户昵称");
cellHead.setCellStyle(style);
cellHead = rowHead.createCell(3);
cellHead.setCellValue("评论时间");
cellHead.setCellStyle(style);
cellHead = rowHead.createCell(4);
cellHead.setCellValue("评论内容");
cellHead.setCellStyle(style);
cellHead = rowHead.createCell(5);
cellHead.setCellValue("获赞数");
cellHead.setCellStyle(style);
HSSFCellStyle cellStyle = workbook.createCellStyle();
cellStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER);
List<MusicComment> comments = musicCommentMessage.getComments();
for (int i = 0; i < comments.size(); i++) {
MusicComment comment = comments.get(i);
HSSFRow row = sheet.createRow(i + 1);
HSSFCell cell = row.createCell(0);
cell.setCellValue(musicCommentMessage.getSongTitle());
cell.setCellStyle(cellStyle);
cell = row.createCell(1);
cell.setCellValue(comment.getType());
cell.setCellStyle(cellStyle);
cell = row.createCell(2);
cell.setCellValue(comment.getNickname());
cell.setCellStyle(cellStyle);
cell = row.createCell(3);
cell.setCellValue(comment.getCommentDate());
cell.setCellStyle(cellStyle);
cell = row.createCell(4);
cell.setCellValue(comment.getContent());
cell.setCellStyle(cellStyle);
cell = row.createCell(5);
cell.setCellValue(comment.getAppreciation());
cell.setCellStyle(cellStyle);
}
String path = Constants.COMMENTS_PATH + StringUtils.dealWithFilename(musicCommentMessage.getSongTitle()) + Constants.COMMENTS_SUFFIX;
logger.info(path);
FileOutputStream fos = new FileOutputStream(path);
workbook.write(fos);
fos.close();
}
歌曲队列
package personal.mario.service;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
/*歌曲队列*/
public class MusicQueueService {
private static Queue<String> uncrawledMusics = new ConcurrentLinkedQueue<String>();
private static Queue<String> crawledMusics = new ConcurrentLinkedQueue<String>();
public static void addUncrawledMusic(String e) {
uncrawledMusics.offer(e);
}
public static String getTopMusicUrl() {
if (!uncrawledMusics.isEmpty()) {
return uncrawledMusics.poll();
}
return null;
}
public static void addCrawledMusic(String e) {
crawledMusics.offer(e);
}
public static boolean isMusicCrawled(String id) {
return crawledMusics.contains(id);
}
public static boolean isUncrawledMusicQueueEmpty() {
return uncrawledMusics.isEmpty();
}
public static void printAll() {
while (!uncrawledMusics.isEmpty()) {
System.out.println(uncrawledMusics.poll());
}
}
public static int getCrawledMusicSize() {
return crawledMusics.size();
}
}