Java 爬取网页内容(标题、图片等)

绝地灬酷狼 2022-09-12 10:49 294阅读 0赞
  1. package com.fh.util;
  2. import java.io.BufferedReader;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.io.InputStreamReader;
  6. import java.net.URL;
  7. import java.net.URLConnection;
  8. import java.util.ArrayList;
  9. import java.util.List;
  10. import java.util.regex.Matcher;
  11. import java.util.regex.Pattern;
  /**
   * Small static helpers for scraping a web page: raw HTML source, page title
   * and the URLs of the images it references.
   *
   * <p>Author: FH Admin, from: fhadmin.cn
   */
  public class GetWeb {

      /**
       * Matches an {@code <img>} tag whose {@code src}/{@code src2} attribute ends
       * in a known image extension. Group 3 is the optional opening quote and
       * group 4 is the raw attribute value.
       */
      private static final Pattern IMG_TAG_PATTERN = Pattern.compile(
              "<(img|IMG)\\b[^>]*\\b(src|SRC|src2|SRC2)\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*>");

      /**
       * Matches a {@code <title>} element in any letter case, with or without
       * attributes. Group 1 is the title text.
       */
      private static final Pattern TITLE_PATTERN =
              Pattern.compile("<title\\b[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE);

      /**
       * Downloads the resource at the given URL and returns its text.
       *
       * <p>The body is decoded as UTF-8 and read line by line; line terminators
       * are discarded, so the result is one long line. {@link #getTilte(String)}
       * relies on this because its pattern does not match across line breaks.
       *
       * @param httpUrl address of the page to read
       * @return the page content with all line terminators removed
       * @throws IOException if the URL is malformed or the resource cannot be read
       */
      public static String getHtmlCode(String httpUrl) throws IOException {
          URL url = new URL(httpUrl);
          StringBuilder content = new StringBuilder();
          // try-with-resources closes the stream even when a read fails part-way.
          try (InputStream in = url.openStream();
               BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  content.append(line);
              }
          }
          return content.toString();
      }

      /**
       * Collects the URLs of all images referenced by {@code <img>} tags on a page.
       *
       * <p>Values that already start with {@code http://} or {@code https://} are
       * returned unchanged. Every other value (relative, root-relative
       * {@code /x.png}, protocol-relative {@code //host/x.png}) is resolved
       * against the page URL using standard URL resolution. A value that cannot
       * be resolved is returned as written in the page.
       *
       * @param httpUrl address of the page to scan
       * @return the image URLs in document order; empty if the page has none
       * @throws IOException if the page URL is malformed or the page cannot be read
       */
      public static List<String> getImagePathList(String httpUrl)
              throws IOException {
          String content = getHtmlCode(httpUrl);
          URL pageUrl = new URL(httpUrl);
          List<String> imgList = new ArrayList<String>();
          Matcher matcher = IMG_TAG_PATTERN.matcher(content);
          while (matcher.find()) {
              String quote = matcher.group(3);
              String imgsrc = matcher.group(4);
              if (quote == null || quote.trim().length() == 0) {
                  // An unquoted attribute value ends at the first whitespace.
                  imgsrc = imgsrc.split("\\s+")[0];
              }
              imgList.add(toAbsoluteUrl(pageUrl, imgsrc));
          }
          return imgList;
      }

      /** Resolves an image reference against the page URL; returns it unchanged if that is not possible. */
      private static String toAbsoluteUrl(URL pageUrl, String imgsrc) {
          if (imgsrc.startsWith("http://") || imgsrc.startsWith("https://")) {
              return imgsrc;
          }
          try {
              return new URL(pageUrl, imgsrc).toString();
          } catch (java.net.MalformedURLException e) {
              // e.g. an unsupported scheme; hand the raw value back to the caller.
              return imgsrc;
          }
      }

      /**
       * Returns the text of the first {@code <title>} element of a page.
       *
       * <p>The method name keeps its historical spelling so existing callers
       * continue to compile.
       *
       * @param httpUrl address of the page to read
       * @return the title text, or {@code null} if the page has no title or could
       *         not be read (the I/O failure is printed to standard error)
       */
      public static String getTilte(String httpUrl) {
          try {
              Matcher matcher = TITLE_PATTERN.matcher(getHtmlCode(httpUrl));
              if (matcher.find()) {
                  return matcher.group(1);
              }
          } catch (IOException e) {
              // Best effort: report the failure and fall through to null.
              e.printStackTrace();
          }
          return null;
      }

      /**
       * Checks whether a network resource exists by trying to open it.
       *
       * @param strUrl address of the resource to probe
       * @return {@code true} if an input stream could be opened, {@code false} if
       *         the URL is malformed or opening the stream failed
       */
      public static boolean isNetFileAvailable(String strUrl) {
          InputStream netFileInputStream = null;
          try {
              URLConnection urlConn = new URL(strUrl).openConnection();
              netFileInputStream = urlConn.getInputStream();
              return netFileInputStream != null;
          } catch (IOException e) {
              return false;
          } finally {
              if (netFileInputStream != null) {
                  try {
                      netFileInputStream.close();
                  } catch (IOException ignored) {
                      // The probe result is already decided; a failed close changes nothing.
                  }
              }
          }
      }
  }

发表评论

表情:
评论列表 (有 0 条评论,294人围观)

还没有评论,来说两句吧...

相关阅读