Crawling CSDN Blogs with Java
I recently needed to build a search tool for technical blogs, which got me into web crawling. Java is the language I know best (mainly because it has so many well-packaged libraries), so I wrote a small demo with it. Let's get into it.
Reference blog link
1. Request rules. I use Jsoup to fetch pages, and it can apply whatever rules you configure to each request, so we need a request-rule class. PS: these rules turn out to be barely used later on; a sketch of how they could drive a Jsoup request follows the class below.
The parent class for rules, which most requests need:
package com.lg.po;
import java.io.Serializable;
/** * Parent class for request rules; most requests need one. * @author LG */
public class SuperRule implements Serializable{
/** * Page URL */
private String url;
/** * GET/POST request type */
private int requestType = GET;
/** * GET request */
public final static int GET = 0;
/** * POST request */
public final static int POST = 1;
/** * Parameter names */
private String[] params;
/** * Values matching the parameters */
private String[] values;
/** * Tag used for the first filtering pass over the returned HTML */
private String resultTagName;
/** * CLASS / ID / SELECTION: the type of resultTagName, ID by default */
private int type = ID;
public final static int ID = 0;
public final static int CLASS = 1;
public final static int SELECTION = 2;
public SuperRule(){}
public SuperRule(String url){
this.url = url;}
public SuperRule(String url, int requestType, String[] params, String[] values,
String resultTagName,int type) {
super();
this.url = url;
this.requestType = requestType;
this.params = params;
this.values = values;
this.resultTagName = resultTagName;
this.type = type;
}
//getters and setters omitted
}
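Even though the rules end up barely used, here is a minimal sketch of how a SuperRule could actually drive a Jsoup request. RuleRequest and fetch are names made up for illustration, and the getters are the "omitted" ones from the class above:
package com.lg.utils;
import java.io.IOException;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.lg.po.SuperRule;
/** Sketch only: turn a SuperRule into a Jsoup request (assumes the omitted getters exist). */
public class RuleRequest {
public static Document fetch(SuperRule rule) throws IOException {
Connection conn = Jsoup.connect(rule.getUrl())
.userAgent("Mozilla/5.0")//mimic a browser, as the HttpClient code later does
.timeout(10000);
//attach the rule's parameters, if any
if (rule.getParams() != null && rule.getValues() != null) {
for (int i = 0; i < rule.getParams().length; i++) {
conn.data(rule.getParams()[i], rule.getValues()[i]);
}
}
//dispatch on the request type declared by the rule
return rule.getRequestType() == SuperRule.POST ? conn.post() : conn.get();
}
}
Jsoup's connect() builder already covers headers, parameters, and both verbs, which may be why the rule class saw so little use in the end.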
The rule class for parsing CSDN blogs:
package com.lg.po;
/** * Rule class for blogs * @author LG */
public class BlogRule extends SuperRule{
/** * Whether this post was reached directly from its blog space; false by default */
private boolean isDirect = false;
/** * The type of this blog link, BLOG by default */
private int blogType = BLOG;
/** * HOME = my page, SPACE = blog space, BLOG = a single post */
public final static int HOME = 0;
public final static int SPACE = 1;
public final static int BLOG = 2;
public BlogRule(){}
public BlogRule(String url, int blogType) {
super(url);
this.blogType = blogType;
}
public BlogRule(boolean isDirect, int blogType) {
super();
this.isDirect = isDirect;
this.blogType = blogType;
}
//getters and setters omitted
}
2. Fetching the page content. When requests hit a server too often and in too large a volume, you may run into its anti-crawling defenses. Plenty of workarounds exist online, such as lowering the request rate or spreading the crawl across time windows; here I rotate proxies instead: as soon as one IP stops working, the next one takes over (a sketch of loading the proxy list from a local file follows the utility class below). Free proxy IP lists are easy to find online.
package com.lg.utils;
import java.io.IOException;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.lg.filter.DatasBackUp;
import com.lg.po.BlogRule;
import com.lg.po.SuperRule;
/** * Utility class for downloading and parsing pages * @author LG */
public class ParseCommenUtil {
/** * Flag so that only one thread at a time may swap the proxy IP */
private static boolean ischange = true;
/** * Proxy IP */
private static String proxyHost = "218.106.96.196";
/** * Proxy port */
private static int proxyPort = 80;
/** * Download the page at the rule's URL */
public static Document download(SuperRule rule,Queue<String> ipQueue){
Document doc = downloadHttp(rule,ipQueue);
try {
doc.setBaseUri("http://blog.csdn.net");//set the base URI, since some links in the page are relative
} catch (Exception e) {
System.out.println("failed to set the base URL");
}
return doc;
}
/** * Download a page via HttpClient * @param rule * @param ipQueue * @return */
public static Document downloadHttp(SuperRule rule,Queue<String> ipQueue){
//fetch the page via HttpClient
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(rule.getUrl());
//mimic a browser here, otherwise the server simply rejects you
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.2)");
httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
HttpResponse response = null;
while(true){
try {
response = httpClient.execute(new HttpHost(proxyHost, proxyPort),httpGet);
HttpEntity entity = response.getEntity();
return Jsoup.parse(EntityUtils.toString(entity));
} catch (Exception e) { //on any failure, switch to another proxy IP
ischange = true;//open the gate
changeIP(rule, ipQueue);
}
}
}
/** * Swap the proxy IP. The static fields here must stay thread-safe, hence the synchronized method. * @param r * @param ipQueue */
public static synchronized void changeIP(SuperRule r,Queue<String> ipQueue){
if(ischange){
//when a swap is allowed, one thread enters while the others wait
if (ipQueue.size()>0) {
try {
String ip_port = ipQueue.poll();
String[] ips = ip_port.split(":");
proxyHost = ips[0];
proxyPort = Integer.parseInt(ips[1]);
System.out.println("成功换IP---------------------"+proxyHost);
} catch (Exception e2) {
System.out.println("换IP发生错误");
}
}
ischange = false;//once one thread has swapped the IP, no other thread may swap again until this IP fails; close the gate
}
}
/** * Print the elements in the queue * @param q */
public static void sop(Queue<BlogRule> q){
BlogRule rule = null;
while((rule=q.poll())!=null){
System.out.println(rule.getUrl());
}
}
/** * The crawl fans out like a tree, so duplicates are inevitable. A Map records URLs that have already been parsed; HashMap is used because its hash-table storage makes lookups fast. */
public static boolean isDownloaded(Map<String, Boolean> map,String url){
if(map.get(url)==null){
//not in the map yet
return false;
}
else {
return true;
}
}
/** * Crawling sometimes hits errors, or the prepared proxy IPs run out, and the program has to stop. The parsed pages already in the database must not be inserted twice, so this method writes both the unprocessed URLs and the already-processed URLs to local files. * @param queuespace */
public static void writeToLocal(Queue<BlogRule> queue,Queue<BlogRule> queuespace,Map<String, Boolean> map){
CountDownLatch countDownLatch = new CountDownLatch(3);//latch: only after all three writes finish may the code below continue
String basepath = "C:\\Users\\LG\\Desktop\\proxy\\";//pick your own storage path
new Thread(new DatasBackUp(queue,basepath+"blogurl.txt",countDownLatch)).start();//file for the pending blog-post URLs
new Thread(new DatasBackUp(queuespace,basepath+"blogspace.txt",countDownLatch)).start();//file for the blog-space URLs (a space lists all of an author's posts)
new Thread(new DatasBackUp(map,basepath+"blogtemp.txt",countDownLatch)).start();//file for the already-parsed URLs
try {
countDownLatch.await();
System.out.println("备份完成");
} catch (Exception e1) {
System.out.println("备份等待时错误");
}
}
}
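download() assumes ipQueue already holds ip:port entries; the post loads them elsewhere (in DatasRecover, which is not shown). A minimal sketch of filling the queue from a local text file, one ip:port per line, might look like this; ProxyLoader and the file format are assumptions:
package com.lg.utils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Queue;
/** Sketch only: fill the proxy queue from a local "ip:port"-per-line file (format is assumed). */
public class ProxyLoader {
public static void load(Queue<String> ipQueue, String path) {
try {
for (String line : Files.readAllLines(Paths.get(path))) {
line = line.trim();
//keep only well-formed "ip:port" entries so changeIP() can split on ':'
if (!line.isEmpty() && line.contains(":")) {
ipQueue.offer(line);
}
}
System.out.println("loaded " + ipQueue.size() + " proxies");
} catch (IOException e) {
System.out.println("failed to read proxy file: " + path);
}
}
}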
3. The main classes that parse blogs, including one abstract class.
package com.lg.parse;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.lg.po.BlogDatas;
import com.lg.po.BlogRule;
import com.lg.utils.ParseCommenUtil;
/** * Abstract class for parsing blogs * @author LG */
public abstract class AbstractBlogParse implements BlogParseable<BlogDatas>{
/** * Storage used to detect duplicates */
public static Map<String, Boolean> map = Collections.synchronizedMap(new HashMap<String, Boolean>());
/** * Number of threads to start */
protected int count = 0;
public void setCount(int count) {
this.count = count;
}
/** * Queue holding the proxy IPs */
public static Queue<String> ipQueue = new LinkedList<String>();
/** * Queue of blog-post links awaiting parsing; thread-safe */
public static Queue<BlogRule> queue = new ConcurrentLinkedQueue<BlogRule>();
/** * Queue of blog spaces awaiting parsing; thread-safe */
public static Queue<BlogRule> queuespace = new ConcurrentLinkedQueue<BlogRule>();
/** * The main parse method, declared by the BlogParseable interface, whose only method is parseMain() */
public Set<BlogDatas> parseMain() {
CountDownLatch countDownLatch = new CountDownLatch(count);
for (int i = 0; i < count; i++) { //开启多线程爬取,--------这里是正式开始爬了
new Thread(new MyRunnable(countDownLatch)).start();
}
try {
countDownLatch.await();
} catch (Exception e1) {
System.out.println("countDownLatch发成了异常---计数器");
}
return null;
}
public abstract void parseSpace(BlogRule rule);
public abstract void parseBlog(BlogRule rule);
/** * Walk the blog-space links, e.g. http://blog.csdn.net/lmj623565791, and enqueue them */
protected void getSpacesUrl(Elements links) {
if (links.size()>0) {
for(Element link:links){
BlogRule br = new BlogRule(link.absUrl("href"),BlogRule.SPACE);
if (!ParseCommenUtil.isDownloaded(map, br.getUrl())) {
queuespace.offer(br);
}
}
}
}
/** * Walk all the blog posts, e.g. http://blog.csdn.net/lmj623565791/article/details/50709663, and enqueue them */
protected void getBlogsUrl(Elements links) {
if (links.size()>0) {
for(Element link:links){
BlogRule br = new BlogRule();
String url = link.absUrl("href");
if (!ParseCommenUtil.isDownloaded(map, url)) {
br.setUrl(url);
br.setBlogType(BlogRule.BLOG);
br.setIsDirect(true);
queue.offer(br);
}
}
}
}
/** * Worker thread that parses pages * @author LG */
class MyRunnable implements Runnable{
private CountDownLatch countdown = null;
public MyRunnable(CountDownLatch countdown){
this.countdown = countdown;
}
public void run() {
BlogRule rule = null;
while(((rule=queue.poll())!=null)||((rule=queuespace.poll())!=null)){
//while either queue still has entries
System.out.println(queue.size()+"--"+queuespace.size());
//dispatch on the type of the link
switch (rule.getBlogType()){
case BlogRule.BLOG:
parseBlog(rule);
break;
case BlogRule.SPACE:
parseSpace(rule);
break;
}
}
countdown.countDown();
}
}
}
//the concrete subclass
package com.lg.parse;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.lg.db.OperateDB;
import com.lg.po.BlogRule;
import com.lg.po.SuperRule;
import com.lg.utils.ParseCommenUtil;
import com.lg.utils.TextUtil;
/** * The main class for parsing CSDN blogs */
public class CsdnBlogsParse extends AbstractBlogParse{
/** * Number of pages parsed so far */
public static int dataCount = 0;
/** * How often the unprocessed and processed URLs are backed up locally; here, once more than 500 records have gone into the database */
public static int backTime = 0;
/** * Buffer holding parsed page data awaiting database insertion; usable from multiple threads */
private static StringBuffer dataBuffer = new StringBuffer();
public static StringBuffer getDataBuffer() {
return dataBuffer;
}
public static void setDataBuffer(StringBuffer dataBuffer) {
CsdnBlogsParse.dataBuffer = dataBuffer;
}
/** * Parse a single blog post * @param rule */
public void parseBlog(BlogRule rule){
if (!ParseCommenUtil.isDownloaded(map, rule.getUrl())) {
//if the URL has not been visited yet
Document doc = ParseCommenUtil.download(rule,ipQueue);
//narrow the search scope
Element ele = doc.getElementById("article_details");
if (ele!=null) {
String blog_title = null;//title
String blog_url = null;//link
String labels = null;//labels
String readDate = null,reads=null,comments = null;//publish date and the raw read/comment strings
int readTimes = 0;//read count
int commentTimes = 0;//comment count
String userClass = null;//category
Elements title = ele.select("div.article_title > h1 > span > a");
if (title!=null&&title.size()>0) {
blog_title = title.first().ownText();
blog_title = blog_title.replaceAll("'", "\\\\'");
blog_url = title.first().absUrl("href");
}
Elements cate = ele.select("span.link_categories");
if (cate!=null&&cate.size()>0) {
Elements categorys = cate.first().getElementsByTag("a");
if (categorys!=null&&categorys.size()>0) {
String temp = "";
for(Element category:categorys)
temp+= category.text()+",";
labels = temp;
}
}
Elements reads_before = ele.select("div.article_r");
if (reads_before!=null&&reads_before.size()>0) {
readDate = reads_before.first().child(0).text();
reads = reads_before.first().child(1).text();
readTimes = Integer.parseInt(reads.substring(0, reads.indexOf("人")));
comments = reads_before.first().child(2).text();
commentTimes = Integer.parseInt(comments.substring(comments.indexOf("(")+1, comments.indexOf(")")));//parsed inside the guard so a missing element cannot cause a NullPointerException
}
Elements c = ele.select("div.category_r > label > span");
if (c!=null&&c.size()>0) {
Element classes = c.first();
userClass = classes.ownText();
}
dataBuffer.append("('" +blog_title+"','" +blog_url+ "','" + labels+ "','" +readDate+ "',"+readTimes +"," +
commentTimes +",'" + userClass + "'" +"),");
OperateDB.insert();//once about 50 pages' worth of data has accumulated, flush it to the database (a hypothetical sketch of OperateDB follows this class)
parseComment(doc);
if(!rule.getIsDirect()){
//enqueue this post's blog space
queuespace.offer(new BlogRule("http://blog.csdn.net/"+TextUtil.getUserName(rule.getUrl()),BlogRule.SPACE));
parseHome(doc);
}
}
map.put(rule.getUrl(), true);
}
}
/** * Collect the URLs of all posts in a blog space */
public void parseSpace(BlogRule rule){
if (!ParseCommenUtil.isDownloaded(map, rule.getUrl())) {
Document doc = ParseCommenUtil.download(rule,ipQueue);
if (TextUtil.isOpenBlog(doc)&&doc.select("div.article_title")!=null) {
//if the author has an active blog
Elements links = doc.select("div.article_title > h1 > span > a");
if (links!=null&&links.size()>0) {
getBlogsUrl(links);
}
parseNextPage(doc);
//related links in the left column
//parseLeftLink(doc);
//my home page
parseHome(doc);
map.put(rule.getUrl(), true); //record the parsed URL in the map
}
}
}
/** * Parse my personal home page */
protected void parseHome(Element ele){
String personHome = ele.getElementById("blog_userface").child(0).absUrl("href");
Document doc = ParseCommenUtil.download(new SuperRule(personHome),ipQueue);
Elements relations = doc.select("div.mod_relations");
if (relations!=null&&relations.size()>0) {
Elements persionlinks = relations.first().getElementsByTag("a");
getSpacesUrl(persionlinks);
}
}
/** * Parse the related links in the left sidebar */
protected void parseLeftLink(Element doc) {
Element links_before = doc.getElementById("side");
if (links_before!=null) {
Elements links = links_before.select("ul.panel_body > ul > li > a");
getSpacesUrl(links);
}
}
/** * Parse the commenters' profile links; still buggy, needs improvement */
protected void parseComment(Element doc) {
Element links_before = doc.getElementById("comment_list");
if (links_before!=null) {
Elements links = links_before.select("dd.comment_userface > a");
getSpacesUrl(links);
}
}
/** * Parse the next pages of a blog space for more post URLs */
public void parseNextPage(Element doc){
Element pagelists = doc.getElementById("papelist");
if (pagelists!=null) {
String firstUrl = pagelists.select("a").first().absUrl("href");
String nextUrl = firstUrl.split("article/list")[0] + "article/list/";//build the next-page URL prefix
String pagedesc = pagelists.select("span").first().text();
Pattern pattern = Pattern.compile("([0-9]+).*?([0-9]+)");
Matcher matcher = pattern.matcher(pagedesc.trim());
int pageCount = 0;
while(matcher.find()){
pageCount = Integer.parseInt(matcher.group(2));
}
for (int i = 2; i <= pageCount; i++) {
Document docpage = ParseCommenUtil.download(new SuperRule(nextUrl + i),ipQueue);
getBlogsUrl(docpage.select("div.article_title > h1 > span > a"));
}
}
}
}
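OperateDB is one of the classes the post leaves out. Judging from how dataBuffer accumulates "(...)," value tuples and the flush-at-50 comment above, a minimal JDBC sketch might look like the following; the table name, column names, and connection settings are all assumptions:
package com.lg.db;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import com.lg.parse.CsdnBlogsParse;
/** Sketch only: flush the shared buffer to MySQL in batches (every name here is assumed). */
public class OperateDB {
private static final int BATCH_SIZE = 50;//the post flushes once about 50 pages have accumulated
private static int pending = 0;//pages buffered since the last flush
public static synchronized void insert() {
if (++pending < BATCH_SIZE) return;
String values = CsdnBlogsParse.getDataBuffer().toString();
if (values.isEmpty()) { pending = 0; return; }
if (values.endsWith(",")) {
values = values.substring(0, values.length() - 1);//dataBuffer appends "(...)," per page
}
String sql = "INSERT INTO blog (title, url, labels, post_date, read_times, comment_times, category) VALUES " + values;
try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spider", "root", "root");
Statement st = conn.createStatement()) {
st.executeUpdate(sql);
} catch (SQLException e) {
System.out.println("insert failed: " + e.getMessage());
}
CsdnBlogsParse.setDataBuffer(new StringBuffer());//reset the buffer either way
pending = 0;
}
}
Concatenated SQL mirrors the post's dataBuffer approach; in real code a PreparedStatement batch would be safer than escaping quotes by hand as parseBlog() does.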
4. The seed-data class. Every crawl needs a starting point, and this class provides it.
package com.lg.base;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.lg.filter.DatasRecover;
import com.lg.parse.AbstractBlogParse;
import com.lg.parse.CsdnBlogsParse;
import com.lg.po.BlogDatas;
import com.lg.po.BlogRule;
import com.lg.utils.ParseCommenUtil;
/** * Class that gathers the seed data, using http://blog.csdn.net/index.html as the entry point * @author LG */
public class CsdnParseIndex extends AbstractBlogParse{
/** * Object that parses blog spaces and individual posts */
CsdnBlogsParse blogsParse = null;
Document doc = null;
/** * User-defined start URL; the initial rule */
private BlogRule blogrule = null;
public CsdnParseIndex(BlogRule blogrule){
this.blogrule = blogrule;
}
public CsdnParseIndex(){}
public Set<BlogDatas> parseMain() {
//initialize
init();
if (queue.size()<=0||queuespace.size()>0) {
//如果备份中没有数据
if (blogrule==null) {
//按照默认来进行
blogrule = new BlogRule("http://blog.csdn.net/index.html", 3);
parseBlog(null);
parseSpace(null);
//fetch the following pages
for (int count = 2; count < 23; count++) {
String url = "http://blog.csdn.net/index.html?&page="+count;
blogrule.setUrl(url);
parseBlog(null);
}
}
else {
//user-defined start
queue.offer(blogrule);
}
}
blogsParse = new CsdnBlogsParse();
blogsParse.setCount(count);
blogsParse.parseMain();//start parsing
return null;
}
/** * Extract the links in the left column of http://blog.csdn.net/index.html */
@Override
public void parseSpace(BlogRule rule) {
Element left = doc.select("div.main_left").first();
Elements alives = left.select("dl.alive_user > dt > a");
//recommended experts on the left
getSpacesUrl(alives);
Elements experts = left.select("dl.experts > dd > ul > li > a");
//blog experts on the left
getSpacesUrl(experts);
}
/** * Extract the center elements of http://blog.csdn.net/index.html */
@Override
public void parseBlog(BlogRule rule) {
doc = ParseCommenUtil.download(blogrule,null);
Element center = doc.select("div.main_center").first();
Elements links = center.select("div.blog_list > h1");
if (links.size()>0) {
for(Element link:links){
BlogRule br = new BlogRule();
String url = null;
if(link.select("a").size()>1)
url = link.child(1).absUrl("href");
else {
url = link.child(0).absUrl("href");
}
if (!ParseCommenUtil.isDownloaded(map, url)) {
br.setUrl(url);
br.setBlogType(BlogRule.BLOG);
br.setIsDirect(true);
queue.offer(br);
}
}
}
}
/** * Check for URLs left unprocessed by an earlier run; if any exist, resume from them; this also loads the proxy IPs (a sketch of the omitted DatasRecover class follows below) */
public void init(){
CountDownLatch countDownLatch = new CountDownLatch(4);
for (int i = 0; i < 4; i++) {
new Thread(new DatasRecover(i, countDownLatch)).start();
}
try {
countDownLatch.await();
System.out.println("数据恢复完成");
} catch (Exception e1) {
System.out.println("备份恢复时错误");
}
}
}
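DatasRecover is also omitted from the post. Given that writeToLocal() writes blogurl.txt, blogspace.txt, and blogtemp.txt, and that init() starts four of these workers (the fourth presumably reading the proxy IPs), a sketch under the assumption of one entry per line per file might be:
package com.lg.filter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import com.lg.parse.AbstractBlogParse;
import com.lg.po.BlogRule;
/** Sketch only: restore one backup file per worker (file names and format are assumed). */
public class DatasRecover implements Runnable {
private final int which;//0..3, as started from init()
private final CountDownLatch latch;
private static final String BASE = "C:\\Users\\LG\\Desktop\\proxy\\";
public DatasRecover(int which, CountDownLatch latch) {
this.which = which;
this.latch = latch;
}
public void run() {
try {
switch (which) {
case 0: for (String u : lines("blogurl.txt")) AbstractBlogParse.queue.offer(new BlogRule(u, BlogRule.BLOG)); break;
case 1: for (String u : lines("blogspace.txt")) AbstractBlogParse.queuespace.offer(new BlogRule(u, BlogRule.SPACE)); break;
case 2: for (String u : lines("blogtemp.txt")) AbstractBlogParse.map.put(u, true); break;
case 3: for (String u : lines("proxyip.txt")) AbstractBlogParse.ipQueue.offer(u); break;//hypothetical file name
}
} catch (IOException e) {
System.out.println("nothing to recover for slot " + which);
} finally {
latch.countDown();//always release the latch so init() can proceed
}
}
private static List<String> lines(String name) throws IOException {
return Files.readAllLines(Paths.get(BASE + name));
}
}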
5. The spider class; it accepts any seed-data class.
package com.lg.spider;
import com.lg.parse.AbstractBlogParse;
import com.lg.parse.BlogParseable;
/** * Spider class * @author LG */
public class Spider {
/** * Number of crawler threads */
public int count;
public Spider(int count){
this.count = count;
}
AbstractBlogParse blogParse = null;
public AbstractBlogParse getBlogParseable() {
return blogParse;
}
public void setBlogParseable(AbstractBlogParse blogParse) {
this.blogParse = blogParse;
}
/** * Entry point for crawling * @return * @throws Exception */
public String process() throws Exception {
blogParse.setCount(count);
blogParse.parseMain();
return null;
}
}
6. The test class.
public static void testParseIndex() throws Exception{
String url = "http://blog.csdn.net/sunny2038/article/details/6926079";
BlogRule rule = new BlogRule();
rule.setUrl(url);
rule.setBlogType(BlogRule.BLOG);
Spider spider = new Spider(10);
CsdnParseIndex ci = new CsdnParseIndex(rule);//pass the rule in; otherwise the custom start URL would be ignored
spider.setBlogParseable(ci);
spider.process();
}
That is quite a lot of code, and the remaining classes are not pasted here; see the download link.
This is my first blog post, so please forgive the rough formatting. The code is far from polished; feedback from more experienced developers is very welcome.