发现了一个java爬虫框架,熟悉了一下写了一个爬取网易云音乐专辑歌曲信息的爬虫,其中歌曲的歌词和评论是动态获取的,我只能单独自己写代码进行请求。
代码如下
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.SecureRandom;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.crypto.Cipher;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import javax.xml.bind.DatatypeConverter;
import org.apache.commons.codec.binary.Base64;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONPath;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
public class NetEaseMusicPageProcessor implements PageProcessor {
//正则表达式\\. \\转义java中的\ \.转义正则中的.
//主域名
public static final String BASE_URL = "http://music.163.com/";
//匹配专辑URL
public static final String ALBUM_URL = "http://music\\.163\\.com/album\\?id=\\d+";
//匹配歌曲URL
public static final String MUSIC_URL = "http://music\\.163\\.com/song\\?id=\\d+";
//初始地址, JAY_CHOU 周杰伦的床边故事专辑, 可以改为其他歌手专辑地址
public static final String START_URL = "http://music.163.com/album?id=34720827";
//加密使用到的文本
public static final String TEXT = "{\"username\": \"\", \"rememberLogin\": \"true\", \"password\": \"\"}";
//爬取结果保存文件路径
public static final String RESULT_PATH = "/home/user/workspace/WebMagicCrawler/result/";
private Site site = Site.me()
.setDomain("http://music.163.com")
.setSleepTime(1000)
.setRetryTimes(30)
.setCharset("utf-8")
.setTimeOut(30000)
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
@Override
public Site getSite() {
return site;
}
@Override
public void process(Page page) {
//根据URL判断页面类型
if (page.getUrl().regex(ALBUM_URL).match()) {
//爬取歌曲URl加入队列
page.addTargetRequests(page.getHtml().xpath("//div[@id=\"song-list-pre-cache\"]").links().regex(MUSIC_URL).all());
//爬取专辑URL加入队列
page.addTargetRequests(page.getHtml().xpath("//div[@class=\"cver u-cover u-cover-3\"]").links().regex(ALBUM_URL).all());
} else {
String url = page.getUrl().toString();
page.putField("title", page.getHtml().xpath("//em[@class='f-ff2']/text()"));
page.putField("author", page.getHtml().xpath("//p[@class='des s-fc4']/span/a/text()"));
page.putField("album", page.getHtml().xpath("//p[@class='des s-fc4']/a/text()"));
page.putField("URL", url);
//单独对AJAX请求获取评论数, 使用JSON解析返回结果
page.putField("commentCount", JSONPath.eval(JSON.parse(crawlAjaxUrl(url.substring(url.indexOf("id=") + 3))), "$.total"));
}
}
public static void main(String[] args) {
Spider.create(new NetEaseMusicPageProcessor())
//初始URL
.addUrl(START_URL)
.addPipeline(new ConsolePipeline())
//结果输出到文件
.addPipeline(new FilePipeline(RESULT_PATH))
.run();
}
//对AJAX数据进行单独请求
public String crawlAjaxUrl(String songId) {
CloseableHttpClient httpclient = HttpClients.createDefault();
CloseableHttpResponse response =null;
try {
//参数加密
String secKey = new BigInteger(100, new SecureRandom()).toString(32).substring(0, 16);
String encText = aesEncrypt(aesEncrypt(TEXT, "0CoJUm6Qyw8W8jud"), secKey);
String encSecKey = rsaEncrypt(secKey);
HttpPost httpPost = new HttpPost("http://music.163.com/weapi/v1/resource/comments/R_SO_4_" + songId + "/?csrf_token=");
httpPost.addHeader("Referer", BASE_URL);
List<NameValuePair> ls = new ArrayList<NameValuePair>();
ls.add(new BasicNameValuePair("params", encText));
ls.add(new BasicNameValuePair("encSecKey", encSecKey));
UrlEncodedFormEntity paramEntity = new UrlEncodedFormEntity(ls, "utf-8");
httpPost.setEntity(paramEntity);
response = httpclient.execute(httpPost);
HttpEntity entity = response.getEntity();
if (entity != null) {
return EntityUtils.toString(entity, "utf-8");
}
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
response.close();
httpclient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return "";
}
}
关于网易云音乐API加密可以参见我另外一篇博客。
下面是运行结果
[INFO] 2016-10-28 10:51:49 Spider http://music.163.com started!
[INFO] 2016-10-28 10:51:49 downloading page http://music.163.com/album?id=34720827
get page: http://music.163.com/album?id=34720827
[INFO] 2016-10-28 10:51:50 downloading page http://music.163.com/song?id=415792916
get page: http://music.163.com/song?id=415792916
title: 床边故事
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=415792916
commentCount: 52535
[INFO] 2016-10-28 10:51:52 downloading page http://music.163.com/song?id=418602084
get page: http://music.163.com/song?id=418602084
title: 说走就走
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=418602084
commentCount: 24256
[INFO] 2016-10-28 10:51:53 downloading page http://music.163.com/song?id=418603076
get page: http://music.163.com/song?id=418603076
title: 一点点
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=418603076
commentCount: 33127
[INFO] 2016-10-28 10:51:54 downloading page http://music.163.com/song?id=415792918
get page: http://music.163.com/song?id=415792918
title: 前世情人
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=415792918
commentCount: 26761
[INFO] 2016-10-28 10:51:55 downloading page http://music.163.com/song?id=418602085
get page: http://music.163.com/song?id=418602085
title: 英雄
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=418602085
commentCount: 13321
[INFO] 2016-10-28 10:51:57 downloading page http://music.163.com/song?id=417250561
get page: http://music.163.com/song?id=417250561
title: 不该(with aMEI)
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=417250561
commentCount: 37877
[INFO] 2016-10-28 10:51:58 downloading page http://music.163.com/song?id=418602086
get page: http://music.163.com/song?id=418602086
title: 土耳其冰淇淋
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=418602086
commentCount: 21751
[INFO] 2016-10-28 10:51:59 downloading page http://music.163.com/song?id=418603077
get page: http://music.163.com/song?id=418603077
title: 告白气球
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=418603077
commentCount: 117647
[INFO] 2016-10-28 10:52:00 downloading page http://music.163.com/song?id=417247652
get page: http://music.163.com/song?id=417247652
title: Now You See Me
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=417247652
commentCount: 17485
[INFO] 2016-10-28 10:52:01 downloading page http://music.163.com/song?id=418602087
get page: http://music.163.com/song?id=418602087
title: 爱情废柴
author: 周杰伦
album: 周杰伦的床边故事
URL: http://music.163.com/song?id=418602087
commentCount: 24268
[INFO] 2016-10-28 10:52:02 downloading page http://music.163.com/album?id=34685590
get page: http://music.163.com/album?id=34685590
[INFO] 2016-10-28 10:52:04 downloading page http://music.163.com/album?id=34588039
get page: http://music.163.com/album?id=34588039
[INFO] 2016-10-28 10:52:05 downloading page http://music.163.com/album?id=3084335
get page: http://music.163.com/album?id=3084335
[INFO] 2016-10-28 10:52:06 downloading page http://music.163.com/album?id=2662137
get page: http://music.163.com/album?id=2662137
[INFO] 2016-10-28 10:52:07 downloading page http://music.163.com/song?id=411921852
get page: http://music.163.com/song?id=411921852
title: 惊叹号(Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=411921852
commentCount: 16768
[INFO] 2016-10-28 10:52:08 downloading page http://music.163.com/song?id=412327297
get page: http://music.163.com/song?id=412327297
title: 龙拳 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327297
commentCount: 2563
[INFO] 2016-10-28 10:52:09 downloading page http://music.163.com/song?id=412327298
get page: http://music.163.com/song?id=412327298
title: 最后的战役 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327298
commentCount: 3849
[INFO] 2016-10-28 10:52:11 downloading page http://music.163.com/song?id=412327299
get page: http://music.163.com/song?id=412327299
title: 天台 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327299
commentCount: 1283
[INFO] 2016-10-28 10:52:12 downloading page http://music.163.com/song?id=412327300
get page: http://music.163.com/song?id=412327300
title: 比较大的大提琴 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327300
commentCount: 1223
[INFO] 2016-10-28 10:52:13 downloading page http://music.163.com/song?id=412327301
get page: http://music.163.com/song?id=412327301
title: 快门慢舞 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327301
commentCount: 1287
[INFO] 2016-10-28 10:52:14 downloading page http://music.163.com/song?id=412327302
get page: http://music.163.com/song?id=412327302
title: 打架舞 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327302
commentCount: null
[INFO] 2016-10-28 10:52:15 downloading page http://music.163.com/song?id=412327303
get page: http://music.163.com/song?id=412327303
title: 哪里都是你 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327303
commentCount: 2674
[INFO] 2016-10-28 10:52:16 downloading page http://music.163.com/song?id=412327304
get page: http://music.163.com/song?id=412327304
title: 一路向北 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327304
commentCount: 11502
[INFO] 2016-10-28 10:52:17 downloading page http://music.163.com/song?id=412327305
get page: http://music.163.com/song?id=412327305
title: 不能说的秘密 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327305
commentCount: 3595
[INFO] 2016-10-28 10:52:18 downloading page http://music.163.com/song?id=412327306
get page: http://music.163.com/song?id=412327306
title: 双截棍 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327306
commentCount: 1892
[INFO] 2016-10-28 10:52:20 downloading page http://music.163.com/song?id=412327307
get page: http://music.163.com/song?id=412327307
title: 明明就 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327307
commentCount: 1898
[INFO] 2016-10-28 10:52:21 downloading page http://music.163.com/song?id=412327308
get page: http://music.163.com/song?id=412327308
title: Mine Mine (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327308
commentCount: 1957
[INFO] 2016-10-28 10:52:22 downloading page http://music.163.com/song?id=412327309
get page: http://music.163.com/song?id=412327309
title: 龙卷风 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327309
commentCount: 1923
[INFO] 2016-10-28 10:52:23 downloading page http://music.163.com/song?id=412327310
get page: http://music.163.com/song?id=412327310
title: 公公偏头痛 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327310
commentCount: 1348
[INFO] 2016-10-28 10:52:24 downloading page http://music.163.com/song?id=412327311
get page: http://music.163.com/song?id=412327311
title: 青花瓷 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327311
commentCount: 1916
[INFO] 2016-10-28 10:52:25 downloading page http://music.163.com/song?id=412327312
get page: http://music.163.com/song?id=412327312
title: 斗牛 / 水手怕水 / 大笨钟 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327312
commentCount: 1595
[INFO] 2016-10-28 10:52:27 downloading page http://music.163.com/song?id=412327313
get page: http://music.163.com/song?id=412327313
title: 彩虹 / 轨迹 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327313
commentCount: 3209
[INFO] 2016-10-28 10:52:28 downloading page http://music.163.com/song?id=412327314
get page: http://music.163.com/song?id=412327314
title: 手语 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327314
commentCount: 1403
[INFO] 2016-10-28 10:52:29 downloading page http://music.163.com/song?id=412327315
get page: http://music.163.com/song?id=412327315
title: 开不了口 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327315
commentCount: 3067
[INFO] 2016-10-28 10:52:30 downloading page http://music.163.com/song?id=412327316
get page: http://music.163.com/song?id=412327316
title: 乌克丽丽 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327316
commentCount: 1664
[INFO] 2016-10-28 10:52:32 downloading page http://music.163.com/song?id=412327317
get page: http://music.163.com/song?id=412327317
title: 阳光宅男 (Live)
author: 周杰伦
album: 魔天伦 世界巡回演唱会
URL: http://music.163.com/song?id=412327317
commentCount: 2018
[INFO] 2016-10-28 10:52:33 downloading page http://music.163.com/song?id=407679169
get page: http://music.163.com/song?id=407679169
title: 英雄
author: 周杰伦
album: 英雄
URL: http://music.163.com/song?id=407679169
commentCount: 72800
[INFO] 2016-10-28 10:52:34 downloading page http://music.163.com/song?id=29822016
get page: http://music.163.com/song?id=29822016
title: 阳明山
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822016
commentCount: 3858
[INFO] 2016-10-28 10:52:35 downloading page http://music.163.com/song?id=29822010
get page: http://music.163.com/song?id=29822010
title: 窃爱
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822010
commentCount: 5511
[INFO] 2016-10-28 10:52:36 downloading page http://music.163.com/song?id=29818120
get page: http://music.163.com/song?id=29818120
title: 算什么男人
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29818120
commentCount: 24813
[INFO] 2016-10-28 10:52:37 downloading page http://music.163.com/song?id=29822015
get page: http://music.163.com/song?id=29822015
title: 天涯过客
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822015
commentCount: 10229
[INFO] 2016-10-28 10:52:39 downloading page http://music.163.com/song?id=29822033
get page: http://music.163.com/song?id=29822033
title: 怎么了
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822033
commentCount: 5023
[INFO] 2016-10-28 10:52:40 downloading page http://music.163.com/song?id=29822032
get page: http://music.163.com/song?id=29822032
title: 一口气全念对
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822032
commentCount: 5846
[INFO] 2016-10-28 10:52:41 downloading page http://music.163.com/song?id=29822017
get page: http://music.163.com/song?id=29822017
title: 我要夏天
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822017
commentCount: 5938
[INFO] 2016-10-28 10:52:42 downloading page http://music.163.com/song?id=29822018
get page: http://music.163.com/song?id=29822018
title: 手写的从前
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822018
commentCount: 22051
[INFO] 2016-10-28 10:52:47 downloading page http://music.163.com/song?id=29818117
get page: http://music.163.com/song?id=29818117
title: 鞋子特大号
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29818117
commentCount: null
[INFO] 2016-10-28 10:52:48 downloading page http://music.163.com/song?id=29822013
get page: http://music.163.com/song?id=29822013
title: 听爸爸的话
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822013
commentCount: null
[INFO] 2016-10-28 10:52:49 downloading page http://music.163.com/song?id=29822012
get page: http://music.163.com/song?id=29822012
title: 美人鱼
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822012
commentCount: 10662
[INFO] 2016-10-28 10:52:50 downloading page http://music.163.com/song?id=29822014
get page: http://music.163.com/song?id=29822014
title: 听见下雨的声音
author: 周杰伦
album: 哎呦,不错哦
URL: http://music.163.com/song?id=29822014
commentCount: 17453
[INFO] 2016-10-28 10:52:51 downloading page http://music.163.com/song?id=27698922
get page: http://music.163.com/song?id=27698922
title: 你怎么说+红尘客栈+千里之外
author: 周杰伦
album: 周杰伦2013《魔天伦》台北小巨蛋演唱会
URL: http://music.163.com/song?id=27698922
commentCount: 1915