java爬取国家应急平台漏洞公告数据-蒲公英云

java爬取国家应急平台漏洞公告数据

用java爬取http类型的网站比较容易实现：因为不需要建立证书通道，直接通过httpclient访问链接、获取响应源码，就可以拿到相关数据。而对于https网站，我们可以借助证书的方式，实现java爬取https网站的相关数据。

获取https网站证书的方式可以看我上一篇博客：https://blog.csdn.net/qq_36706878/article/details/102546563

这里直接附上源码和运行结果

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import javax.net.ssl.SSLContext;
import org.apache.commons.logging.*;
import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the paginated vulnerability-announcement listing at
 * {@code https://www.cert.org.cn/publish/main/9/index.html} over HTTPS and prints a
 * fixed-offset excerpt of every list item found inside elements with class
 * {@code waring_con}.
 *
 * <p>The TLS trust material is loaded from a local keystore. Its location and password
 * default to {@code E:/key.keystore} / {@code 123456} and can be overridden with the
 * system properties {@code reptile.keystore.path} and {@code reptile.keystore.password}.
 */
public class HttpsReptile {
    // Root URL of the site; prepended to relative pagination links when the document has no base URI.
    public static String mainUrl = "https://www.cert.org.cn";

    // Text that identifies the "next page" anchor in the pager.
    private static final String NEXT_PAGE_LABEL = "下一页";

    // Character offsets into a list item's outer HTML that delimit the printed excerpt.
    // NOTE(review): these offsets are tied to the site's current markup — confirm if the layout changes.
    private static final int SNIPPET_START = 85;
    private static final int SNIPPET_END = 159;

    // System properties that override the keystore location and password.
    private static final String KEYSTORE_PATH_PROPERTY = "reptile.keystore.path";
    private static final String KEYSTORE_PASSWORD_PROPERTY = "reptile.keystore.password";

    /**
     * Entry point of the crawl: fetches the first listing page, prints its items and then
     * follows the "next page" links until none is left.
     */
    public static void httpsReptile(){
         // Vulnerability-announcement listing page
         String leakUrl = "https://www.cert.org.cn/publish/main/9/index.html";
         // Process the first page
         Document leakListDoc = sendHttp(leakUrl);
         int page = 1;
         System.out.println("处理第"+page+"页数据");
         solvePageData(leakListDoc);
         getPageCode(leakListDoc,page);
    }

    /**
     * Follows the "next page" link chain starting from an already-processed page, printing
     * the items of every subsequent page.
     *
     * <p>Only the first usable "next page" anchor of each page is followed, and a URL is
     * never fetched twice, so duplicated pager links or a last page that links to itself
     * cannot cause repeated work or unbounded recursion.
     *
     * @param leakListDoc the page whose successor should be fetched; {@code null} is a no-op
     * @param page        1-based number of {@code leakListDoc}, used for the progress output
     */
    public static void getPageCode(Document leakListDoc,int page) {
        if (leakListDoc == null) {
            return;
        }
        Set<String> visited = new HashSet<String>();
        // location() is the base URI the document was parsed with (empty when none was given).
        String startLocation = leakListDoc.location();
        if (startLocation != null && startLocation.length() > 0) {
            visited.add(startLocation);
        }

        Document current = leakListDoc;
        while (true) {
            String nextHref = findNextPageUrl(current);
            // Stop when there is no next link or when the link leads to a page already seen.
            if (nextHref == null || !visited.add(nextHref)) {
                break;
            }
            current = sendHttp(nextHref);
            page++;
            System.out.println("处理第"+page+"页数据");
            solvePageData(current);
        }
    }

    /**
     * Returns the absolute URL of the first anchor whose outer HTML contains the
     * "next page" label and that carries a navigable {@code href}.
     *
     * @param doc page to search
     * @return the resolved URL, or {@code null} when the page has no usable next link
     */
    private static String findNextPageUrl(Document doc) {
        Elements anchors = doc.getElementsByTag("a");
        for (Element anchor : anchors) {
            if (!anchor.toString().contains(NEXT_PAGE_LABEL)) {
                continue;
            }
            String href = anchor.attr("href").trim();
            // Disabled pager entries have no target to follow.
            if (href.length() == 0 || href.startsWith("#") || href.startsWith("javascript:")) {
                continue;
            }
            // Resolve against the document's base URI when one is known ...
            String absolute = anchor.absUrl("href");
            if (absolute.length() > 0) {
                return absolute;
            }
            // ... otherwise fall back to prefixing the site root.
            return mainUrl + href;
        }
        return null;
    }

    /**
     * Prints an excerpt of every {@code <li>} nested in an element with class
     * {@code waring_con}.
     *
     * @param leakListDoc the listing page to process; {@code null} is a no-op
     */
    public static void solvePageData(Document leakListDoc) {
        if (leakListDoc == null) {
            return;
        }
        for (Element container : leakListDoc.getElementsByClass("waring_con")) {
            for (Element item : container.getElementsByTag("li")) {
                String snippet = extractSnippet(item.toString());
                if (snippet.length() > 0) {
                    System.out.println(snippet);
                }
            }
        }
    }

    /**
     * Cuts the fixed-offset excerpt out of a list item's outer HTML, clamping the end
     * offset to the string length so that short items cannot raise
     * {@link StringIndexOutOfBoundsException}.
     *
     * @param liHtml outer HTML of one list item
     * @return the excerpt, or an empty string when the item is shorter than the start offset
     */
    private static String extractSnippet(String liHtml) {
        if (liHtml.length() <= SNIPPET_START) {
            return "";
        }
        return liHtml.substring(SNIPPET_START, Math.min(SNIPPET_END, liHtml.length()));
    }

    /**
     * Performs an HTTPS GET and parses the response body as HTML.
     *
     * <p>Failures are reported on the console and never propagated: on any exception or a
     * non-200 status the returned document is parsed from an empty string.
     *
     * @param url address to fetch
     * @return the parsed document, with {@code url} as its base URI so relative links can
     *         be resolved; never {@code null}
     */
    public static Document sendHttp(String url) {
        String html = "";
        CloseableHttpClient httpclient = null;
        CloseableHttpResponse response = null;
        try {
            SSLConnectionSocketFactory sslsf = createSSLConnSocketFactory();
            httpclient = HttpClients.custom()
                .setSSLSocketFactory(sslsf).build();
            HttpGet httpget = new HttpGet(url);
            httpget.addHeader(HttpHeaders.USER_AGENT,
                    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
            // 10 s limit for both establishing the connection and waiting for data
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(10000).setConnectTimeout(10000).build();
            httpget.setConfig(requestConfig);
            System.out.println("Executing request " + httpget.getRequestLine());
            response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            System.out.println("----------------------------------------");
            System.out.println(response.getStatusLine());
            int resStatu = response.getStatusLine().getStatusCode();
            // Only a 200 response carries a body worth parsing
            if (resStatu == HttpStatus.SC_OK) {
                if (entity != null) {
                    html = EntityUtils.toString(entity, "UTF-8");
                    // NOTE(review): the original call's first argument rendered as a blank;
                    // presumably it was a non-breaking space (U+00A0) being normalised — confirm.
                    html = html.replace('\u00A0', ' ');
                }
            }
            EntityUtils.consume(entity);
        } catch(Exception e){
            System.err.println("Request failed for " + url);
            e.printStackTrace();
        }finally{
            if(response!=null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if(httpclient!=null){
                try {
                    httpclient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        // Supplying the request URL as base URI lets callers resolve relative hrefs via absUrl().
        return Jsoup.parse(html, url);
    }

    /**
     * Builds the SSL socket factory from the trust material in the configured keystore.
     *
     * <p>No protocol list is pinned: passing {@code null} lets the JVM's enabled protocols
     * apply. The previous hard-coded {@code TLSv1} is disabled by default on current JDKs
     * and makes the handshake fail.
     *
     * @return a socket factory using the default hostname verifier
     * @throws Exception if the keystore cannot be read or the SSL context cannot be built
     */
    private static SSLConnectionSocketFactory createSSLConnSocketFactory()
            throws Exception {
        // Keystore file and password must match each other
        String keystorePath = System.getProperty(KEYSTORE_PATH_PROPERTY, "E:/key.keystore");
        String keystorePassword = System.getProperty(KEYSTORE_PASSWORD_PROPERTY, "123456");
        SSLContext sslcontext = SSLContexts
                .custom()
                .loadTrustMaterial(
                        new File(keystorePath),
                        keystorePassword.toCharArray(), new TrustSelfSignedStrategy())
                .build();
        return new SSLConnectionSocketFactory(
                sslcontext, null, null,
                SSLConnectionSocketFactory.getDefaultHostnameVerifier());
    }

    /**
     * Runs the crawl.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        httpsReptile();
    }
}