java爬取国家应急平台漏洞公告数据

深碍√TFBOYSˉ_ 2023-06-08 03:08 78阅读 0赞

用 Java 爬取 HTTP 网站比较容易实现:不需要建立证书通道,直接通过 HttpClient 访问链接、获取响应源码,就可以拿到相关数据。对于 HTTPS 网站,则可以通过加载证书的方式建立 SSL 通道,同样用 Java 爬取到相关数据。

获取https网站证书的方式可以看我上一篇博客:https://blog.csdn.net/qq_36706878/article/details/102546563

这里直接附上源码和运行结果

  import java.io.File;
  import java.io.IOException;
  import java.util.HashSet;
  import java.util.Set;
  import javax.net.ssl.SSLContext;
  import org.apache.commons.logging.*;
  import org.apache.http.*;
  import org.apache.http.client.config.RequestConfig;
  import org.apache.http.client.methods.CloseableHttpResponse;
  import org.apache.http.client.methods.HttpGet;
  import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
  import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
  import org.apache.http.impl.client.CloseableHttpClient;
  import org.apache.http.impl.client.HttpClients;
  import org.apache.http.ssl.SSLContexts;
  import org.apache.http.util.EntityUtils;
  import org.jsoup.Jsoup;
  import org.jsoup.nodes.Document;
  import org.jsoup.nodes.Element;
  import org.jsoup.select.Elements;
  19. public class HttpsReptile {
  20. //国家应急中心域名
  21. public static String mainUrl = "https://www.cert.org.cn";
  22. public static void httpsReptile(){
  23. //国家应急中心漏洞公告页面
  24. String leakUrl = "https://www.cert.org.cn/publish/main/9/index.html";
  25. //处理第一页的源码
  26. Document leakListDoc = sendHttp(leakUrl);
  27. int page = 1;
  28. System.out.println("处理第"+page+"页数据");
  29. solvePageData(leakListDoc);
  30. getPageCode(leakListDoc,page);
  31. }
  32. /**
  33. * 获取下一页的源码
  34. * @param leakListDoc
  35. * @param page
  36. */
  37. public static void getPageCode(Document leakListDoc,int page) {
  38. //获取下一页的链接 访问获取源码
  39. Elements aElements = leakListDoc.getElementsByTag("a");
  40. for(int i=0;i<aElements.size();i++){
  41. Element aString = aElements.get(i);
  42. if(aString.toString().contains("下一页")) {
  43. String nextHref = aString.attr("href");
  44. nextHref = mainUrl + nextHref;
  45. //处理下一页的源码
  46. Document newPageLeakListDoc = sendHttp(nextHref);
  47. page++;
  48. System.out.println("处理第"+page+"页数据");
  49. solvePageData(newPageLeakListDoc);
  50. getPageCode(newPageLeakListDoc,page);
  51. }
  52. }
  53. }
  54. /**
  55. * 处理某一页的漏洞公告
  56. * @param leakListDoc
  57. */
  58. public static void solvePageData(Document leakListDoc) {
  59. Elements ulElements = leakListDoc.getElementsByClass("waring_con");
  60. for(int i=0;i<ulElements.size();i++){
  61. Elements liElements = ulElements.get(i).getElementsByTag("li");
  62. for(int j=0;j<liElements.size();j++) {
  63. String onclickvalue = liElements.get(j).toString();
  64. onclickvalue = onclickvalue.substring(85, 159);
  65. System.out.println(onclickvalue);
  66. }
  67. }
  68. }
  69. public static Document sendHttp(String url) {
  70. String html = "";
  71. CloseableHttpClient httpclient = null;
  72. CloseableHttpResponse response = null;
  73. try {
  74. SSLConnectionSocketFactory sslsf = createSSLConnSocketFactory();
  75. httpclient = HttpClients.custom()
  76. .setSSLSocketFactory(sslsf).build();
  77. HttpGet httpget = new HttpGet(url);
  78. httpget.addHeader(HttpHeaders.USER_AGENT,
  79. "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
  80. RequestConfig requestConfig = RequestConfig.custom()
  81. .setSocketTimeout(10000).setConnectTimeout(10000).build();// 设置请求和传输超时时间
  82. httpget.setConfig(requestConfig);
  83. System.out.println("Executing request " + httpget.getRequestLine());
  84. response = httpclient.execute(httpget);
  85. HttpEntity entity = response.getEntity();
  86. System.out.println("----------------------------------------");
  87. System.out.println(response.getStatusLine());
  88. int resStatu = response.getStatusLine().getStatusCode();// 返回码
  89. if (resStatu == HttpStatus.SC_OK) {// 200正常 其他就不对
  90. // 获得相应实体
  91. if (entity != null) {
  92. html = EntityUtils.toString(entity, "UTF-8");
  93. html = html.replace(" ", " ");
  94. //System.out.println(html);
  95. }
  96. }
  97. EntityUtils.consume(entity);
  98. } catch(Exception e){
  99. e.printStackTrace();
  100. }finally{
  101. if(response!=null){
  102. try {
  103. response.close();
  104. } catch (IOException e) {
  105. e.printStackTrace();
  106. }
  107. }
  108. if(httpclient!=null){
  109. try {
  110. httpclient.close();
  111. } catch (IOException e) {
  112. e.printStackTrace();
  113. }
  114. }
  115. }
  116. Document document = Jsoup.parse(html);
  117. return document;
  118. }
  119. // ssl通道证书的创建
  120. private static SSLConnectionSocketFactory createSSLConnSocketFactory()
  121. throws Exception {
  122. SSLContext sslcontext = SSLContexts
  123. .custom()
  124. .loadTrustMaterial(
  125. new File(
  126. "E:/key.keystore"),
  127. "123456".toCharArray(), new TrustSelfSignedStrategy()) //文件和密码要对应
  128. .build();
  129. SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(
  130. sslcontext, new String[] { "TLSv1" }, null,
  131. SSLConnectionSocketFactory.getDefaultHostnameVerifier());
  132. return sslsf;
  133. }
  134. public static void main(String[] args) {
  135. // TODO Auto-generated method stub
  136. httpsReptile();
  137. //sendHttp("https://blog.csdn.net/qq_36706878");
  138. }
  139. }

运行结果如下:程序会输出各个详情页的链接,之后再逐个访问这些链接,即可获取详情页的数据。

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzM2NzA2ODc4_size_16_color_FFFFFF_t_70

发表评论

表情:
评论列表 (有 0 条评论,78人围观)

还没有评论,来说两句吧...

相关阅读