java爬取国家应急平台漏洞公告数据
java爬取http类型的网站比较容易实现,因为不需要建立证书的通道,直接通过httpclient访问链接获取相应源码就可以获取相关数据,现在我们可以通过证书的方式,实现java爬取https网站的相关数据。
获取https网站证书的方式可以看我上一篇博客:https://blog.csdn.net/qq_36706878/article/details/102546563
这里直接附上源码和运行结果
import java.io.File;
import java.io.IOException;
import javax.net.ssl.SSLContext;
import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.apache.commons.logging.*;
public class HttpsReptile {

    /** Base domain of China's National Internet Emergency Center (CNCERT); used to absolutize relative links. */
    public static String mainUrl = "https://www.cert.org.cn";

    /**
     * Entry point of the crawl: fetches the first vulnerability-bulletin list
     * page, processes it, then walks through every following page.
     */
    public static void httpsReptile() {
        // Vulnerability bulletin list page (page 1).
        String leakUrl = "https://www.cert.org.cn/publish/main/9/index.html";
        Document leakListDoc = sendHttp(leakUrl);
        int page = 1;
        System.out.println("处理第" + page + "页数据");
        solvePageData(leakListDoc);
        getPageCode(leakListDoc, page);
    }

    /**
     * Follows the "next page" ("下一页") link until no such link exists,
     * processing each page along the way.
     *
     * <p>Fix: the original implementation recursed once per matching anchor
     * inside a loop, which could overflow the stack on deep paging and could
     * process a page more than once when several anchors matched. This version
     * walks the pages iteratively and follows at most one next-link per page.
     * The public signature is unchanged.
     *
     * @param leakListDoc parsed DOM of the current (already processed) list page
     * @param page        1-based index of the page already processed
     */
    public static void getPageCode(Document leakListDoc, int page) {
        Document current = leakListDoc;
        while (current != null) {
            String nextHref = null;
            for (Element anchor : current.getElementsByTag("a")) {
                if (anchor.toString().contains("下一页")) {
                    nextHref = mainUrl + anchor.attr("href");
                    break; // follow only the first "next page" link
                }
            }
            if (nextHref == null) {
                return; // last page reached — no "next page" anchor present
            }
            current = sendHttp(nextHref);
            page++;
            System.out.println("处理第" + page + "页数据");
            solvePageData(current);
        }
    }

    /**
     * Prints the detail-page link of every bulletin on one list page.
     *
     * <p>Fix: the original code took {@code substring(85, 159)} of each
     * {@code <li>}'s HTML, which throws {@code StringIndexOutOfBoundsException}
     * on short items and silently breaks whenever the site's markup shifts by a
     * single character. We instead read the {@code href} attribute of the
     * anchors inside each item, which is robust to formatting changes.
     *
     * @param leakListDoc parsed DOM of a bulletin list page
     */
    public static void solvePageData(Document leakListDoc) {
        Elements ulElements = leakListDoc.getElementsByClass("waring_con");
        for (int i = 0; i < ulElements.size(); i++) {
            Elements liElements = ulElements.get(i).getElementsByTag("li");
            for (int j = 0; j < liElements.size(); j++) {
                for (Element anchor : liElements.get(j).getElementsByTag("a")) {
                    String href = anchor.attr("href");
                    if (!href.isEmpty()) {
                        // Relative links need the domain prepended.
                        System.out.println(href.startsWith("http") ? href : mainUrl + href);
                    }
                }
            }
        }
    }

    /**
     * Performs an HTTPS GET against {@code url} and returns the response body
     * parsed as a jsoup {@link Document}. Errors are logged and swallowed
     * (best-effort, matching the original behavior), in which case an empty
     * document is returned.
     *
     * @param url absolute URL to fetch
     * @return parsed HTML, possibly empty when the request failed or was not 200
     */
    public static Document sendHttp(String url) {
        String html = "";
        CloseableHttpClient httpclient = null;
        CloseableHttpResponse response = null;
        try {
            SSLConnectionSocketFactory sslsf = createSSLConnSocketFactory();
            httpclient = HttpClients.custom()
                    .setSSLSocketFactory(sslsf).build();
            HttpGet httpget = new HttpGet(url);
            httpget.addHeader(HttpHeaders.USER_AGENT,
                    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
            // Connect / read timeouts: 10 s each.
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(10000).setConnectTimeout(10000).build();
            httpget.setConfig(requestConfig);
            System.out.println("Executing request " + httpget.getRequestLine());
            response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            System.out.println("----------------------------------------");
            System.out.println(response.getStatusLine());
            int resStatu = response.getStatusLine().getStatusCode();
            if (resStatu == HttpStatus.SC_OK && entity != null) {
                html = EntityUtils.toString(entity, "UTF-8");
                // Fix: the original called html.replace(" ", " ") — a no-op left
                // over from stripping non-breaking spaces. Replace the HTML
                // entity and the U+00A0 character with a regular space instead.
                html = html.replace("&nbsp;", " ").replace('\u00A0', ' ');
            }
            EntityUtils.consume(entity);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close response before client; each close is independently guarded.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (httpclient != null) {
                try {
                    httpclient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return Jsoup.parse(html);
    }

    /**
     * Builds an SSL socket factory backed by the trust store at
     * {@code E:/key.keystore} (password {@code 123456}); self-signed
     * certificates found in the store are trusted.
     *
     * <p>Fix: the original offered only {@code TLSv1}, which most modern
     * servers reject; TLSv1.1 and TLSv1.2 are now offered as well, which is
     * backward compatible.
     *
     * @return socket factory for the HTTPS connection
     * @throws Exception when the keystore cannot be read or the context fails to build
     */
    private static SSLConnectionSocketFactory createSSLConnSocketFactory()
            throws Exception {
        SSLContext sslcontext = SSLContexts
                .custom()
                .loadTrustMaterial(
                        new File("E:/key.keystore"),
                        "123456".toCharArray(), // keystore file and password must match
                        new TrustSelfSignedStrategy())
                .build();
        return new SSLConnectionSocketFactory(
                sslcontext, new String[] { "TLSv1", "TLSv1.1", "TLSv1.2" }, null,
                SSLConnectionSocketFactory.getDefaultHostnameVerifier());
    }

    public static void main(String[] args) {
        httpsReptile();
        //sendHttp("https://blog.csdn.net/qq_36706878");
    }
}
运行结果如下 获取到详情页的各个链接 再次获取数据就行
还没有评论,来说两句吧...