使用轻量级JAVA 爬虫Gecco工具抓取新闻DEMO

雨点打透心脏的1/2处 2022-07-26 06:06 88阅读 0赞

## 写在前面 ##

最近看到Gecco爬虫工具，感觉比较简单好用，所以写个DEMO测试一下，抓取网站  
[http://zj.zjol.com.cn/home.html][http_zj.zjol.com.cn_home.html]，主要抓取新闻的标题和发布时间作为抓取测试对象。抓取HTML节点时可以像jQuery选择器一样选择节点，非常方便；Gecco主要利用注解来实现URL匹配，代码看起来比较简洁美观。

**Gecco GitHub地址**  
[https://github.com/xtuhcy/gecco][https_github.com_xtuhcy_gecco]  
**Gecco 作者博客**  
[http://my.oschina.net/u/2336761/blog?fromerr=ZuKKo3fH][http_my.oschina.net_u_2336761_blog_fromerr_ZuKKo3fH]

## 添加Maven依赖 ##

<!-- Maven coordinates of the Gecco crawler library used by this demo, pinned to version 1.0.8. -->
<dependency>
                <groupId>com.geccocrawler</groupId>
                <artifactId>gecco</artifactId>
                <version>1.0.8</version>
            </dependency>

## 编写抓取列表页面 ##

/**
 * Bean for the news list page. It is bound to URLs of the form
 * {@code home.html?pageIndex={pageIndex}&pageSize={pageSize}} and handed to the
 * pipeline registered as {@code "zJNewsListPipelines"}.
 *
 * <p>Getters are required because {@code ZJNewsListPipelines} reads the request,
 * the page index and the link list through them.
 * NOTE(review): setters are included on the assumption that Gecco populates
 * beans through JavaBean properties - confirm against the Gecco documentation.
 */
@Gecco(matchUrl = "http://zj.zjol.com.cn/home.html?pageIndex={pageIndex}&pageSize={pageSize}", pipelines = "zJNewsListPipelines")
public class ZJNewsGeccoList implements HtmlBean {

    /** The request that produced this page; the pipeline derives sub-requests from it. */
    @Request
    private HttpRequest request;

    /** Value bound to the {@code {pageIndex}} placeholder of the matched URL. */
    @RequestParameter
    private int pageIndex;

    /** Value bound to the {@code {pageSize}} placeholder of the matched URL. */
    @RequestParameter
    private int pageSize;

    /** Anchors selected by the CSS path below; each one links to a news detail page. */
    @HtmlField(cssPath = "#content > div > div > div.con_index > div.r.main_mod > div > ul > li  > dl > dt > a")
    private List<HrefBean> newList;

    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public int getPageIndex() {
        return pageIndex;
    }

    public void setPageIndex(int pageIndex) {
        this.pageIndex = pageIndex;
    }

    public int getPageSize() {
        return pageSize;
    }

    public void setPageSize(int pageSize) {
        this.pageSize = pageSize;
    }

    public List<HrefBean> getNewList() {
        return newList;
    }

    public void setNewList(List<HrefBean> newList) {
        this.newList = newList;
    }
}

/**
 * Pipeline registered as {@code "zJNewsListPipelines"}. For every list page it
 * schedules one sub-request per news link and then one sub-request for the
 * following list page.
 *
 * <p>Pagination stops as soon as a page yields no links, so the crawl terminates
 * instead of requesting ever-increasing page indexes. The next-page URL reuses
 * the page size captured from the current URL (via the bean's
 * {@code getPageSize()} accessor) rather than a fixed value.
 */
@PipelineName("zJNewsListPipelines")
public class ZJNewsListPipelines implements Pipeline<ZJNewsGeccoList> {

    /**
     * Schedules the detail pages found on the given list page and the next list page.
     *
     * @param zjNewsGeccoList the list-page bean carrying the originating request,
     *                        the paging parameters and the extracted links
     */
    public void process(ZJNewsGeccoList zjNewsGeccoList) {
        // An empty (or missing) link list means we ran past the last page: stop here.
        if (zjNewsGeccoList.getNewList() == null || zjNewsGeccoList.getNewList().isEmpty()) {
            return;
        }

        HttpRequest request = zjNewsGeccoList.getRequest();
        for (HrefBean bean : zjNewsGeccoList.getNewList()) {
            // Follow the link into the news detail page.
            // NOTE(review): assumes getUrl() returns a site-relative path - confirm.
            SchedulerContext.into(request.subRequest("http://zj.zjol.com.cn" + bean.getUrl()));
        }

        // Queue the next list page with the same page size as the current one.
        int page = zjNewsGeccoList.getPageIndex() + 1;
        String nextUrl = "http://zj.zjol.com.cn/home.html?pageIndex=" + page
                + "&pageSize=" + zjNewsGeccoList.getPageSize();
        SchedulerContext.into(request.subRequest(nextUrl));
    }
}

## 编写抓取详情页面 ##

/**
 * Bean for a news detail page. It is bound to URLs of the form
 * {@code /news/{code}.html} and handed to the pipeline registered as
 * {@code "zjNewsDetailPipeline"}.
 *
 * <p>Getters are required because {@code ZJNewsDetailPipeline} reads the title
 * and the creation time through them.
 * NOTE(review): setters are included on the assumption that Gecco populates
 * beans through JavaBean properties - confirm against the Gecco documentation.
 */
@Gecco(matchUrl = "http://zj.zjol.com.cn/news/{code}.html", pipelines = "zjNewsDetailPipeline")
public class ZJNewsDetail implements HtmlBean {

    /** Text of the element selected by {@code #headline}. */
    @Text
    @HtmlField(cssPath = "#headline")
    private String title;

    /** Text of the post-time paragraph selected by the CSS path below. */
    @Text
    @HtmlField(cssPath = "#content > div > div.news_con > div.news-content > div:nth-child(1) > div > p.go-left.post-time.c-gray")
    private String createTime;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getCreateTime() {
        return createTime;
    }

    public void setCreateTime(String createTime) {
        this.createTime = createTime;
    }
}

/**
 * Pipeline registered as {@code "zjNewsDetailPipeline"}; writes each detail
 * bean's title and creation time to standard output on a single line.
 */
@PipelineName("zjNewsDetailPipeline")
public class ZJNewsDetailPipeline implements Pipeline<ZJNewsDetail> {

    /**
     * Prints the title and creation time of the given bean, separated by two spaces.
     *
     * @param zjNewsDetail the detail-page bean to print
     */
    public void process(ZJNewsDetail zjNewsDetail) {
        String title = zjNewsDetail.getTitle();
        String createTime = zjNewsDetail.getCreateTime();
        System.out.println(title + "  " + createTime);
    }
}

## 启动主函数 ##

/**
 * Entry point of the demo: configures a Gecco engine and runs the crawl
 * starting from the first news list page.
 */
public class Main {

    /** Package path of this project, passed to the engine's classpath setting. */
    private static final String BEAN_PACKAGE = "com.zhaochao.gecco.zj";

    /** Address of the page where crawling begins. */
    private static final String START_URL = "http://zj.zjol.com.cn/home.html?pageIndex=1&pageSize=100";

    /** Number of crawler threads to start. */
    private static final int CRAWLER_THREADS = 10;

    /**
     * Pause a single crawler takes after finishing each request.
     * NOTE(review): unit is presumably milliseconds - confirm against the Gecco docs.
     */
    private static final int REQUEST_INTERVAL = 10;

    /**
     * Builds the engine configuration and starts crawling.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        GeccoEngine.create()
                .classpath(BEAN_PACKAGE)
                .start(START_URL)
                .thread(CRAWLER_THREADS)
                .interval(REQUEST_INTERVAL)
                // Use the desktop (PC) user agent rather than the mobile one.
                .mobile(false)
                // Start running.
                .run();
    }
}

## 抓取结果 ##

![这里写图片描述][20160408141411339]

![这里写图片描述][20160408141337116]

## 项目完整代码 ##

[http://git.oschina.net/whzhaochao/geccoDemo][http_git.oschina.net_whzhaochao_geccoDemo]

[http_zj.zjol.com.cn_home.html]: http://zj.zjol.com.cn/home.html
[https_github.com_xtuhcy_gecco]: https://github.com/xtuhcy/gecco
[http_my.oschina.net_u_2336761_blog_fromerr_ZuKKo3fH]: http://my.oschina.net/u/2336761/blog?fromerr=ZuKKo3fH
[20160408141411339]: /images/20220724/d41c6a620df64d69b1be4152a1f2073e.png
[20160408141337116]: /images/20220724/5fae6efd8d394475be9662470f187780.png
[http_git.oschina.net_whzhaochao_geccoDemo]: http://git.oschina.net/whzhaochao/geccoDemo