Jsoup爬取网络内容(包括图片文件),保存到本地和保存到数据库(一)

绝地灬酷狼 2022-12-20 15:54 363阅读 0赞

背景:

项目需要某个区县的天气数据,需要从中国气象局的官网爬取。但是,中国气象局提供的接口返回的数据中没有我想要的信息,比如未来24小时的气温,气压,风速等,因此只能从页面中获取,所以使用了Jsoup来解析页面。另外,有一些信息(例如天气图标)是以图片的形式返回的,需要将图片下载到本地并存储到数据库中,用到的数据库是PostgreSQL。用到的工具库是Hutool和Jsoup。

爬取的目标信息

target.xml

  1. <div id=day0 class="clearfix pull-left">
  2. <div class="hour3 hbg">
  3. <div> 11:00 </div>
  4. <div class=hourimg>
  5. <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
  6. </div>
  7. <div> - </div>
  8. <div class=tmp_lte_20> 16.2℃ </div>
  9. <div> 1.7m/s </div>
  10. <div> 东北风 </div>
  11. <div> 1002.1hPa </div>
  12. <div> 56% </div>
  13. <div class=hide> 0% </div>
  14. </div>
  15. <div class="hour3 hbg">
  16. <div> 14:00 </div>
  17. <div class=hourimg>
  18. <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
  19. </div>
  20. <div> - </div>
  21. <div class=tmp_lte_20> 17.8℃ </div>
  22. <div> 1.8m/s </div>
  23. <div> 东北风 </div>
  24. <div> 1000.5hPa </div>
  25. <div> 50.5% </div>
  26. <div class=hide> 0% </div>
  27. </div>
  28. <div class="hour3 hbg">
  29. <div> 17:00 </div>
  30. <div class=hourimg>
  31. <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
  32. </div>
  33. <div> - </div>
  34. <div class=tmp_lte_15> 13.1℃ </div>
  35. <div> 1.3m/s </div>
  36. <div> 东风 </div>
  37. <div> 1000.4hPa </div>
  38. <div> 61.6% </div>
  39. <div class=hide> 0% </div>
  40. </div>
  41. <div class="hour3 hbg">
  42. <div> 20:00 </div>
  43. <div class=hourimg>
  44. <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
  45. </div>
  46. <div> - </div>
  47. <div class=tmp_lte_10> 8.3℃ </div>
  48. <div> 0.8m/s </div>
  49. <div> 东风 </div>
  50. <div> 1002hPa </div>
  51. <div> 65.5% </div>
  52. <div class=hide> 0.2% </div>
  53. </div>
  54. <div class="hour3 hbg">
  55. <div> 23:00 </div>
  56. <div class=hourimg>
  57. <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
  58. </div>
  59. <div> - </div>
  60. <div class=tmp_lte_10> 6.6℃ </div>
  61. <div> 0.4m/s </div>
  62. <div> 东南风 </div>
  63. <div> 1001hPa </div>
  64. <div> 71.7% </div>
  65. <div class=hide> 10.1% </div>
  66. </div>
  67. <div class="hour3 ">
  68. <div> 14日02:00 </div>
  69. <div class=hourimg>
  70. <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
  71. </div>
  72. <div> - </div>
  73. <div class=tmp_lte_10> 6.2℃ </div>
  74. <div> 0.5m/s </div>
  75. <div> 南风 </div>
  76. <div> 1000.5hPa </div>
  77. <div> 81.9% </div>
  78. <div class=hide> 10.9% </div>
  79. </div>
  80. <div class="hour3 ">
  81. <div> 05:00 </div>
  82. <div class=hourimg>
  83. <img src="http://image.nmc.cn/assets/img/w/40x40/3/1.png">
  84. </div>
  85. <div> - </div>
  86. <div class=tmp_lte_10> 7.8℃ </div>
  87. <div> 0.7m/s </div>
  88. <div> 南风 </div>
  89. <div> 1000.1hPa </div>
  90. <div> 93.6% </div>
  91. <div class=hide> 52.6% </div>
  92. </div>
  93. <div class="hour3 ">
  94. <div> 08:00 </div>
  95. <div class=hourimg>
  96. <img src="http://image.nmc.cn/assets/img/w/40x40/3/1.png">
  97. </div>
  98. <div> - </div>
  99. <div class=tmp_lte_10> 9.1℃ </div>
  100. <div> 0.9m/s </div>
  101. <div> 南风 </div>
  102. <div> 1000.3hPa </div>
  103. <div> 84.5% </div>
  104. <div class=hide> 61.4% </div>
  105. </div>
  106. </div>

爬取的目标数据展示

(图片:爬取的目标数据在气象局页面上的展示截图)

建立实体类

TwentyFourHoursDomain.java

  1. package com.imegaware.crawler.weatherForecast.psd;
  2. import java.io.File;
  /**
   * Readings for one three-hour slot of the 24-hour forecast.
   *
   * <p>Every measurement is stored as the raw text scraped from the page
   * (value plus unit); nothing is parsed or converted here.
   *
   * @author pshdhx
   */
  public class TwentyFourHoursDomain {

      /** Forecast time of this slot as shown on the page. */
      private String monitorTime;

      /** Remote URL of the weather icon. */
      private String url;

      /** Local copy of the downloaded weather icon. */
      private File file;

      /** Precipitation ("-" when the page shows none). */
      private String rain;

      /** Air temperature. */
      private String temperature;

      /** Wind speed. */
      private String windSpeed;

      /** Wind direction. */
      private String windDirection;

      /** Air pressure. */
      private String airPressure;

      /** Relative humidity. */
      private String humidity;

      /** Local path of the downloaded icon, as a string. */
      private String url2;

      public String getUrl2() {
          return url2;
      }

      public void setUrl2(String url2) {
          this.url2 = url2;
      }

      public String getMonitorTime() {
          return monitorTime;
      }

      public void setMonitorTime(String monitorTime) {
          this.monitorTime = monitorTime;
      }

      public String getUrl() {
          return url;
      }

      public void setUrl(String url) {
          this.url = url;
      }

      public File getFile() {
          return file;
      }

      public void setFile(File file) {
          this.file = file;
      }

      public String getRain() {
          return rain;
      }

      public void setRain(String rain) {
          this.rain = rain;
      }

      public String getTemperature() {
          return temperature;
      }

      public void setTemperature(String temperature) {
          this.temperature = temperature;
      }

      public String getWindSpeed() {
          return windSpeed;
      }

      public void setWindSpeed(String windSpeed) {
          this.windSpeed = windSpeed;
      }

      public String getWindDirection() {
          return windDirection;
      }

      public void setWindDirection(String windDirection) {
          this.windDirection = windDirection;
      }

      public String getAirPressure() {
          return airPressure;
      }

      public void setAirPressure(String airPressure) {
          this.airPressure = airPressure;
      }

      public String getHumidity() {
          return humidity;
      }

      public void setHumidity(String humidity) {
          this.humidity = humidity;
      }

      /**
       * Returns a debug representation listing every field, including the
       * local icon path {@code url2}.
       */
      @Override
      public String toString() {
          return "TwentyFourHoursDomain [monitorTime=" + monitorTime
                  + ", url=" + url
                  + ", file=" + file
                  + ", rain=" + rain
                  + ", temperature=" + temperature
                  + ", windSpeed=" + windSpeed
                  + ", windDirection=" + windDirection
                  + ", airPressure=" + airPressure
                  + ", humidity=" + humidity
                  + ", url2=" + url2 + "]";
      }
  }

JdbcUtil.java

  1. package com.imegaware.crawler.util;
  2. import java.io.InputStream;
  3. import java.sql.Connection;
  4. import java.sql.DriverManager;
  5. import java.sql.PreparedStatement;
  6. import java.sql.ResultSet;
  7. import java.sql.SQLException;
  8. import java.util.Properties;
  /**
   * Small JDBC helper: reads connection settings from {@code jdbc.properties}
   * on the classpath once, at class-initialisation time, and offers
   * convenience methods for opening connections and closing JDBC resources.
   *
   * @author pshdhx
   */
  public class JdbcUtil {

      /** Classpath resource holding the connection settings. */
      private static final String CONFIG_FILE = "jdbc.properties";

      private static String USERNAME;
      private static String PASSWORD;
      private static String DRIVER;
      private static String URL;

      static {
          loadConfig();
      }

      /**
       * Loads {@code jdbc.username}, {@code jdbc.password}, {@code jdbc.driver}
       * and {@code jdbc.url} from {@code jdbc.properties} on the classpath.
       *
       * @throws RuntimeException if the file is missing or cannot be read; the
       *         underlying exception is attached as the cause
       */
      public static void loadConfig() {
          Properties prop = new Properties();
          try (InputStream inStream = JdbcUtil.class.getClassLoader().getResourceAsStream(CONFIG_FILE)) {
              if (inStream == null) {
                  // getResourceAsStream signals "not found" with null, not an exception.
                  throw new java.io.FileNotFoundException(CONFIG_FILE + " not found on the classpath");
              }
              prop.load(inStream);
          } catch (Exception e) {
              // Keep the original message, but preserve the root cause for diagnosis.
              throw new RuntimeException("读取数据库配置文件异常!", e);
          }
          USERNAME = prop.getProperty("jdbc.username");
          PASSWORD = prop.getProperty("jdbc.password");
          DRIVER = prop.getProperty("jdbc.driver");
          URL = prop.getProperty("jdbc.url");
      }

      public JdbcUtil() {
          super();
      }

      /**
       * Opens a new database connection using the loaded settings.
       *
       * @return a new connection; the caller is responsible for closing it
       * @throws RuntimeException if the driver cannot be loaded or the
       *         connection cannot be established
       */
      public static Connection getConnection() throws RuntimeException {
          try {
              Class.forName(DRIVER);
              return DriverManager.getConnection(URL, USERNAME, PASSWORD);
          } catch (Exception e) {
              throw new RuntimeException("无法获取数据库连接!", e);
          }
      }

      /**
       * Closes the result set, the statement and the connection, in that order.
       * Each argument may be {@code null}; failures are printed and do not stop
       * the remaining resources from being closed.
       *
       * @param connection connection to close, or {@code null}
       * @param preparedStatement statement to close, or {@code null}
       * @param resultSet result set to close, or {@code null}
       */
      public static void close(Connection connection, PreparedStatement preparedStatement, ResultSet resultSet) {
          close(resultSet);
          close(preparedStatement);
          close(connection);
      }

      /**
       * Closes the result set and then the statement; either may be {@code null}.
       *
       * @param preparedStatement statement to close, or {@code null}
       * @param resultSet result set to close, or {@code null}
       */
      public static void close(PreparedStatement preparedStatement, ResultSet resultSet) {
          close(resultSet);
          close(preparedStatement);
      }

      /**
       * Closes the statement if it is not {@code null}; a failure is printed, not thrown.
       *
       * @param preparedStatement statement to close, or {@code null}
       */
      public static void close(PreparedStatement preparedStatement) {
          if (preparedStatement != null) {
              try {
                  preparedStatement.close();
              } catch (SQLException e) {
                  e.printStackTrace();
              }
          }
      }

      /**
       * Closes the result set if it is not {@code null}; a failure is printed, not thrown.
       *
       * @param resultSet result set to close, or {@code null}
       */
      public static void close(ResultSet resultSet) {
          if (resultSet != null) {
              try {
                  resultSet.close();
              } catch (SQLException e) {
                  e.printStackTrace();
              }
          }
      }

      /**
       * Closes the connection if it is not {@code null}; a failure is printed, not thrown.
       *
       * @param connection connection to close, or {@code null}
       */
      public static void close(Connection connection) {
          if (connection != null) {
              try {
                  connection.close();
              } catch (SQLException e) {
                  e.printStackTrace();
              }
          }
      }
  }

ImageUtil.java

  1. package com.imegaware.crawler.util;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.FileOutputStream;
  5. import java.io.IOException;
  6. import java.io.InputStream;
  /**
   * Helpers for moving image bytes between the local file system and streams.
   *
   * @author pshdhx
   */
  public class ImageUtil {

      /**
       * Opens a local image file for reading.
       *
       * @param path path of the image file
       * @return an open stream on the file; the caller must close it
       * @throws IOException if the file does not exist or cannot be opened
       */
      public static FileInputStream readImage(String path) throws IOException {
          return new FileInputStream(new File(path));
      }

      /**
       * Copies all bytes from {@code in} (for example a binary column read from
       * the database) into the file at {@code targetPath}, creating any missing
       * parent directories first.
       *
       * <p>This is best-effort: a failure is printed and not propagated. The
       * input stream is not closed here; it stays owned by the caller.
       *
       * @param in source of the image bytes
       * @param targetPath destination file path; may be a bare file name
       */
      public static void readBin2Image(InputStream in, String targetPath) {
          File target = new File(targetPath);
          // getParentFile() is null for a bare file name, which needs no directory.
          File parent = target.getParentFile();
          if (parent != null && !parent.exists()) {
              // mkdirs() rather than mkdir(): the target may sit several levels deep.
              parent.mkdirs();
          }
          try (FileOutputStream out = new FileOutputStream(target)) {
              byte[] buffer = new byte[1024];
              int read;
              while ((read = in.read(buffer)) != -1) {
                  out.write(buffer, 0, read);
              }
              out.flush();
          } catch (Exception e) {
              // Deliberately broad: this method has never thrown to its callers.
              e.printStackTrace();
          }
      }
  }

爬取主类

TwentyFourHoursWeather.java

  1. package com.imegaware.crawler.weatherForecast.psd;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.IOException;
  5. import java.sql.Connection;
  6. import java.sql.PreparedStatement;
  7. import java.sql.SQLException;
  8. import java.util.ArrayList;
  9. import java.util.List;
  10. import org.jsoup.Jsoup;
  11. import org.jsoup.nodes.Document;
  12. import org.jsoup.nodes.Element;
  13. import org.jsoup.nodes.Node;
  14. import com.alibaba.fastjson.JSONArray;
  15. import com.alibaba.fastjson.JSONObject;
  16. import com.imegaware.crawler.util.ImageUtil;
  17. import com.imegaware.crawler.util.JdbcUtil;
  18. import com.ruoyi.common.utils.StringUtils;
  19. import cn.hutool.core.io.FileUtil;
  20. import cn.hutool.http.HttpUtil;
  21. /**
  22. * 利用jsoup爬取气象局未来24小时的天气情况,温度,湿度,降水,风速,风向,气压
  23. * @author pshdhx
  24. *
  25. */
  26. public class TwentyFourHoursWeather {
  27. //图片存储到本地的路径
  28. private static String myurl = "/home/sdzw/JavaPaChong/ruoyi/imw-crawler/src/main/java/com/imegaware/crawler/weatherForecast/psd/";
  29. private static List<TwentyFourHoursDomain> getItemContent() {
  30. // TODO Auto-generated method stub
  31. Document doc ;
  32. List<TwentyFourHoursDomain> list = new ArrayList<>();
  33. try {
  34. doc = Jsoup.connect("http://www.nmc.cn/publish/forecast/ASD/zichuan.html").get();
  35. Element day0 = doc.getElementById("day0");
  36. for(int i=0;i<day0.childNodeSize();i++) {
  37. Node childNode = day0.childNode(i);
  38. //System.out.println(childNode.childNodeSize()+"--8个模块的子模块=需要的参数个数");
  39. TwentyFourHoursDomain pojo = new TwentyFourHoursDomain();
  40. pojo.setMonitorTime(StringUtils.trim(childNode.childNode(0).childNode(0)+""));
  41. String url = childNode.childNode(1).childNode(0).attr("src")+""; //网址
  42. long size = HttpUtil.downloadFile(url, FileUtil.file(myurl));
  43. //System.out.println("Download size: " + size);
  44. String[] split = url.split("[/]");
  45. File file = new File(myurl+split[split.length-1]);
  46. pojo.setFile(file);
  47. pojo.setUrl2(myurl+split[split.length-1]);
  48. //System.out.println(pojo.getUrl2()+"========"+"本地地址");
  49. pojo.setUrl(url);
  50. pojo.setRain(StringUtils.trim(childNode.childNode(2).childNode(0)+""));
  51. pojo.setTemperature(StringUtils.trim(childNode.childNode(3).childNode(0)+""));
  52. pojo.setWindSpeed(StringUtils.trim(childNode.childNode(4).childNode(0)+""));
  53. pojo.setWindDirection(StringUtils.trim(childNode.childNode(5).childNode(0)+""));
  54. pojo.setAirPressure(StringUtils.trim(childNode.childNode(6).childNode(0)+""));
  55. pojo.setHumidity(StringUtils.trim(childNode.childNode(7).childNode(0)+""));
  56. list.add(pojo);
  57. }
  58. saveData(list);
  59. } catch (IOException e) {
  60. // TODO Auto-generated catch block
  61. e.printStackTrace();
  62. }
  63. return list;
  64. }
  65. /**
  66. * @Title saveData
  67. * @Description 数据入库
  68. * @author maven
  69. * @param param
  70. * @return
  71. * @return int
  72. * @throws SQLException
  73. */
  74. private static void saveData(List<TwentyFourHoursDomain> list) throws RuntimeException {
  75. int rows = 0;
  76. PreparedStatement pstmt;
  77. FileInputStream in = null;
  78. try {
  79. Connection conn = JdbcUtil.getConnection();
  80. String sql = "INSERT INTO twenty_four_hour_weather(id,monitor_time,url,file,rain,temperature,wind_speed,wind_direction,air_pressure,humidity,region_name,region_code,task_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,now());";
  81. pstmt = conn.prepareStatement(sql);
  82. for (int i = 0; i < list.size(); i++) {
  83. pstmt.setString(1, (i+100)*Math.random()*1000000+""+i+(i+10)*Math.random()*1000000);
  84. pstmt.setString(2, list.get(i).getMonitorTime()+"");
  85. pstmt.setString(3, list.get(i).getUrl());
  86. File file = list.get(i).getFile();
  87. in = ImageUtil.readImage(list.get(i).getUrl2());
  88. pstmt.setBinaryStream(4, in, in.available());
  89. pstmt.setString(5, list.get(i).getRain());
  90. pstmt.setString(6, list.get(i).getTemperature());
  91. pstmt.setString(7, list.get(i).getWindSpeed());
  92. pstmt.setString(8, list.get(i).getWindDirection());
  93. pstmt.setString(9, list.get(i).getAirPressure());
  94. pstmt.setString(10, list.get(i).getHumidity());
  95. pstmt.setString(11, "淄川区");
  96. pstmt.setString(12, "370302");
  97. pstmt.addBatch();
  98. }
  99. int[] x = pstmt.executeBatch();
  100. JdbcUtil.close(conn, pstmt, null);
  101. for (int i : x) {
  102. rows += i;
  103. }
  104. System.out.println("入库完成,共插入" + rows + "行数据");
  105. } catch (Exception e) {
  106. throw new RuntimeException("数据入库失败!", e);
  107. }
  108. }
  109. public static void main(String[] args) {
  110. TwentyFourHoursWeather test = new TwentyFourHoursWeather();
  111. List<TwentyFourHoursDomain> itemContent = TwentyFourHoursWeather.getItemContent();
  112. for(int i=0;i<itemContent.size();i++) {
  113. System.out.println(itemContent.get(i).toString());
  114. }
  115. System.out.println(itemContent.size());
  116. }
  117. }

pom.xml

  1. <dependency>
  2. <groupId>org.jsoup</groupId>
  3. <artifactId>jsoup</artifactId>
  4. <version>1.11.3</version>
  5. </dependency>
  6. <dependency>
  7. <groupId>cn.hutool</groupId>
  8. <artifactId>hutool-all</artifactId>
  9. <version>5.4.7</version>
  10. </dependency>

功能实现截图:

(图片:功能实现效果截图)

发表评论

表情:
评论列表 (有 0 条评论,363人围观)

还没有评论,来说两句吧...

相关阅读