java爬虫案例——SpringBoot使用HttpClient、Jsoup爬取京东手机数据
文章目录
- 前言
- 一、准备工作
- 二、项目文件
- 1.项目依赖
- 2.项目配置文件
- 3.pojo
- 4.dao接口
- 5.service接口及其实现类
- 6.HttpClient封装工具类
- 7.爬取任务实现
- 8.启动类
- 三、项目执行效果
- 总结
前言
之前同事分享了一些关于Java爬虫的视频,其中一个演示了如何用HttpClient和Jsoup爬取京东上的手机数据(如图片、标题、sku、spu等)。我跟着视频并参考了几篇博客后,基本实现了这一目标,在此做个简单记录。
一、准备工作
由于需要将爬取到的数据存储到数据库表中,因此需要先建库建表。建库建表SQL如下:
-- Recreate the `crawler` schema from scratch. WARNING: drops any existing `crawler` database.
DROP DATABASE IF EXISTS `crawler`;
CREATE DATABASE IF NOT EXISTS `crawler` DEFAULT CHARSET = `utf8`;
USE `crawler`;
-- Foreign key checks are switched off while the table is rebuilt and restored at the end.
SET FOREIGN_KEY_CHECKS = 0;
DROP TABLE IF EXISTS `jd_item`;
-- One row per crawled SKU; `sku` is indexed because the crawler looks rows up by it.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  -- The entity maps price as a Double; a fixed-point column keeps the fractional part
  -- that an integer column could not store.
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  KEY `sku` (`sku`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET = utf8 COMMENT = '京东商品表';
SET FOREIGN_KEY_CHECKS = 1;
- 项目目录
二、项目文件
1.项目依赖
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the JD crawler: a Spring Boot application using JPA, MySQL, HttpClient and Jsoup. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- Inherit from the Spring Boot parent POM. -->
    <parent>
        <artifactId>spring-boot-starter-parent</artifactId>
        <groupId>org.springframework.boot</groupId>
        <version>2.3.4.RELEASE</version>
    </parent>
    <groupId>cn.mlnt</groupId>
    <artifactId>mlnt-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>
    <!-- NOTE(review): dependencies below without a <version> presumably get it from the parent POM; confirm. -->
    <dependencies>
        <!-- Spring MVC (web starter) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.21</version>
        </dependency>
        <!-- Apache HttpClient, used to download pages and images -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- Jsoup, used to parse the downloaded HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Apache Commons Lang utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>
</project>
2.项目配置文件
application.properties(或使用.yml):
# Database connection: MySQL schema `crawler` on 127.0.0.1:3306, UTF-8 encoding, SSL disabled, Asia/Shanghai server time zone.
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&characterEncoding=utf8&useSSL=false&allowPublicKeyRetrieval=true&serverTimezone=Asia/Shanghai
# NOTE(review): credentials are hard-coded; presumably intended for local development only - confirm before reuse.
spring.datasource.username=root
spring.datasource.password=123456
# JPA settings: target database type, and print the SQL statements JPA issues.
spring.jpa.database=MySQL
spring.jpa.show-sql=true
3.pojo
Item.java:
package cn.mlnt.jd.pojo;
import javax.persistence.*;
import java.util.Date;
/**
 * JPA entity mapped to the {@code jd_item} table; one instance describes one crawled product.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Standard product unit id (the product group this item belongs to). */
    private Long spu;

    /** Stock keeping unit id (the smallest sellable variant). */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time of the record. */
    private Date created;

    /** Last update time of the record. */
    private Date updated;

    /** @return the primary key */
    public Long getId() {
        return this.id;
    }

    /** @param id the primary key */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the SPU id */
    public Long getSpu() {
        return this.spu;
    }

    /** @param spu the SPU id */
    public void setSpu(Long spu) {
        this.spu = spu;
    }

    /** @return the SKU id */
    public Long getSku() {
        return this.sku;
    }

    /** @param sku the SKU id */
    public void setSku(Long sku) {
        this.sku = sku;
    }

    /** @return the product title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the product title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the product price */
    public Double getPrice() {
        return this.price;
    }

    /** @param price the product price */
    public void setPrice(Double price) {
        this.price = price;
    }

    /** @return the product picture */
    public String getPic() {
        return this.pic;
    }

    /** @param pic the product picture */
    public void setPic(String pic) {
        this.pic = pic;
    }

    /** @return the detail page address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the detail page address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the creation time */
    public Date getCreated() {
        return this.created;
    }

    /** @param created the creation time */
    public void setCreated(Date created) {
        this.created = created;
    }

    /** @return the last update time */
    public Date getUpdated() {
        return this.updated;
    }

    /** @param updated the last update time */
    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
4.dao接口
ItemDao.java:
package cn.mlnt.jd.dao;
import cn.mlnt.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link Item} entities keyed by a {@code Long} id.
 * Declares no methods of its own; all operations come from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}
5.service接口及其实现类
ItemService.java:
package cn.mlnt.jd.service;
import cn.mlnt.jd.pojo.Item;
import java.util.List;
/**
 * Service contract for storing and looking up crawled {@link Item}s.
 */
public interface ItemService {

    /**
     * Saves a product.
     *
     * @param item the product to save
     */
    void save(Item item);

    /**
     * Finds products matching the given conditions.
     *
     * @param item a product whose populated fields act as the query conditions
     * @return the matching products
     */
    List<Item> findAll(Item item);
}
ItemServiceImpl.java:
package cn.mlnt.jd.service.impl;
import cn.mlnt.jd.dao.ItemDao;
import cn.mlnt.jd.pojo.Item;
import cn.mlnt.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Saves the product through the repository inside a transaction.
     *
     * @param item the product to save
     */
    @Transactional
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Runs a query-by-example built from the given product.
     *
     * @param item the probe used to build the example
     * @return the products returned by the repository for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}
6.HttpClient封装工具类
HttpUtils.java:
package cn.mlnt.jd.util;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
 * HTTP helper built on Apache HttpClient: fetches pages as text and downloads images to disk.
 *
 * <p>All requests share one pooled connection manager (at most 100 connections in total,
 * 10 per route), one client instance, and the timeouts from {@link #getConfig()}.
 */
@Component
public class HttpUtils {

    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "E:\\images\\";

    /** User-Agent sent with page requests so that they are declared as coming from a browser. */
    private static final String USER_AGENT = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36";

    private final PoolingHttpClientConnectionManager cm;

    /** Single client reused by every request instead of building a new one per call. */
    private final CloseableHttpClient httpClient;

    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool.
        this.cm.setMaxTotal(100);
        // Maximum number of connections per route.
        this.cm.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address of the page
     * @return the response body decoded as UTF-8, or an empty string when the status is not
     *         200, the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        httpGet.addHeader("User-Agent", USER_AGENT);

        String content = "";
        // try-with-resources closes the response on every path.
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                content = EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Downloads an image and stores it in the image directory under a random name.
     *
     * @param url address of the image
     * @return the generated file name, or an empty string when the download failed
     */
    public String doGetImage(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        String savedName = "";
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Rename the image: random UUID plus the extension taken from the URL.
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                File target = new File(IMAGE_DIR + picName);

                // Create the target directory on first use instead of failing every download.
                File parent = target.getParentFile();
                if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
                    throw new IOException("Cannot create image directory: " + parent);
                }

                // The stream must be closed so the file is flushed and its handle released.
                try (OutputStream outputStream = new FileOutputStream(target)) {
                    response.getEntity().writeTo(outputStream);
                }
                // Only report the name once the file has been written completely.
                savedName = picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return savedName;
    }

    /**
     * Extracts the file extension (including the leading dot) from the last path segment of a
     * URL, ignoring any query string or fragment.
     *
     * @param url the image address
     * @return the extension such as {@code .jpg}, or an empty string when the last path segment
     *         has none
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);

        int dot = path.lastIndexOf('.');
        // A dot before the last slash belongs to the host or a directory, not to the file name.
        if (dot < 0 || dot < path.lastIndexOf('/') || dot == path.length() - 1) {
            return "";
        }
        return path.substring(dot);
    }

    /**
     * Builds the request configuration shared by all requests.
     *
     * @return the request configuration
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                // Maximum time to establish a connection, in milliseconds.
                .setConnectTimeout(1000)
                // Maximum time to obtain a connection from the pool, in milliseconds.
                .setConnectionRequestTimeout(500)
                // Maximum time to wait for data, in milliseconds.
                .setSocketTimeout(10000)
                .build();
    }

    /**
     * Manual smoke test: downloads one product page and prints the title element and its text.
     */
    public static void main(String[] args) throws IOException {
        HttpUtils httpUtils = new HttpUtils();
        String itemInfo = httpUtils.doGetHtml("https://item.jd.com/100009082466.html");
        Document document = Jsoup.parse(itemInfo);
        System.out.println(document.select("div#itemName"));
        System.out.println(document.select("div#itemName").text());
    }
}
7.爬取任务实现
ItemTask.java:
package cn.mlnt.jd.task;
import cn.mlnt.jd.pojo.Item;
import cn.mlnt.jd.service.ItemService;
import cn.mlnt.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.Date;
import java.util.List;
@Component
public class ItemTask {
@Resource
private HttpUtils httpUtils;
@Resource
private ItemService itemService;
private static final ObjectMapper MAPPER = new ObjectMapper();
/** * 当下载任务完成后,间隔多长时间进行下一次任务 * @throws Exception */
@Scheduled(fixedDelay = 100*1000)
public void itemTask() throws Exception {
// 声明需要解析的初始地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=1e449f956a3b49319117b81bbde91f3c";
// 按照页面对手机的搜索结果进行遍历解析
for (int i = 1; i < 10; i=i+2) {
String html = httpUtils.doGetHtml(url + i);
// 解析页面,获取商品数据并存储
if (html != null) {
this.parse(html);
}
}
System.out.println("手机数据抓取完成!");
}
/** * 解析页面,获取商品数据并存储 * @param html * @throws Exception */
private void parse(String html) throws Exception {
// 解析html获取Document对象
Document document = Jsoup.parse(html);
// 获取spu
Elements spuEles = document.select("div#J_goodsList > ul > li");
for (Element spuEle : spuEles) {
// 获取spu
String attr = spuEle.attr("data-spu");
long spu = Long.parseLong(attr.equals("") ? "0" : attr);
// 获取sku
Elements skuEles = spuEle.select("li.ps-item");
for (Element skuELe : skuEles) {
// 获取sku
long sku = Long.parseLong(skuELe.select("[data-sku]").attr("data-sku"));
// 根据sku查询商品数据
Item item = new Item();
item.setSku(sku);
List<Item> list = this.itemService.findAll(item);
if(list.size() > 0) {
// 如果商品存在,就进行下一个循环,该商品不保存,因为已存在
continue;
}
// 设置商品的spu
item.setSpu(spu);
// 获取商品详情的url
String itemUrl = "https://item.jd.com/" + sku + ".html";
item.setUrl(itemUrl);
// 获取商品的图片
String picUrl = "https:" + skuELe.select("img[data-sku]").first().attr("data-lazy-img");
//图片路径可能会为空的情况:一下为两种解决方式,第一种会让数据不全,第二种任会报错
if(picUrl.equals("https:")){
break;
}
picUrl = picUrl.replace("/n9/", "/n1/");
String picName = this.httpUtils.doGetImage(picUrl);
item.setPic(picName);
// 获取商品的价格
String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
item.setPrice(price);
//获取商品的标题
String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
// String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
String title = Jsoup.parse(itemInfo).select("div#itemName").text();
item.setTitle(title);
item.setCreated(new Date());
item.setUpdated(item.getCreated());
// 保存商品数据到数据库中
this.itemService.save(item);
}
}
}
}
8.启动类
Application.java:
package cn.mlnt.jd;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring Boot entry point of the crawler.
 *
 * <p>Scheduled tasks must be switched on before they can run, hence {@code @EnableScheduling}.
 */
@SpringBootApplication
@EnableScheduling
public class Application {
    /** Starts the Spring application with the given command-line arguments. */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}
三、项目执行效果
- 爬取到的图片
- 存储到数据库中的记录
总结
参照视频敲完代码后,执行项目并没有爬到数据,原因是视频中没有提到需要在请求中添加User-Agent请求头,把请求声明为浏览器访问。后来参考网上的博客补上该请求头后,遇到的问题基本解决。
//设置请求Request Headers中的User-Agent,告诉京东说这是浏览器访问
httpGet.addHeader("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36");
参考文章地址:
- https://blog.csdn.net/hellowork10/article/details/106292150
- https://blog.csdn.net/weixin_44505194/article/details/106634835
还没有评论,来说两句吧...