
import requests
from lxml import etree
from fake_useragent import UserAgent
class Qiu:
    """Scraper for the qiushibaike.com "8hr" listing pages.

    Calling an instance reads the pagination widget to find the last page
    number, then walks every listing page and prints one dict per entry
    (picture URL, nickname, content, laugh count, comment count).
    """

    # Listing-page URL template; ``{}`` receives the 1-based page number.
    PAGE_URL = "https://www.qiushibaike.com/8hr/page/{}/"
    # Marker searched for in image URLs. When it is present, the trailing
    # len(marker) characters are cut off the URL.
    # NOTE(review): presumably a thumbnail-resize suffix -- confirm against
    # the live markup.
    THUMB_MARKER = "/w/150/h/112"

    # Class-level defaults so instances created without running __init__
    # (e.g. by a subclass that overrides it) still work.
    timeout = 10
    _user_agent = None

    def __init__(self, timeout=10):
        """Create a scraper.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        # Running 1-based index of the entries printed so far.
        self.count = 1
        self.timeout = timeout

    def __call__(self, *args, **kwargs):
        """Run the whole crawl; positional and keyword arguments are ignored."""
        self.get_max_page()

    def get_html(self, base_url):
        """Download *base_url* and return it parsed as an lxml HTML tree.

        Raises:
            requests.RequestException: on connection failure, timeout, or
                an HTTP error status.
        """
        # Build the user-agent generator once and reuse it for every request.
        if self._user_agent is None:
            self._user_agent = UserAgent()
        # Send a randomly chosen browser User-Agent string.
        headers = {"User-Agent": self._user_agent.random}
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, headers=headers, timeout=self.timeout)
        # Fail loudly instead of parsing an error page as if it were a listing.
        response.raise_for_status()
        return etree.HTML(response.text)

    def get_max_page(self):
        """Read the last page number from page 2 and crawl pages 1..max."""
        html_xml = self.get_html(self.PAGE_URL.format(2))
        page_texts = html_xml.xpath("//a/span[@class='page-numbers']/text()")
        self.get_data(self._parse_max_page(page_texts))

    def get_data(self, max_page):
        """Fetch listing pages 1..*max_page* and print every entry found.

        Each entry is printed as ``<running index> <dict>`` and
        ``self.count`` is incremented once per entry.
        """
        for page in range(1, max_page + 1):
            print("===================第{}页开始下载=========================".format(page))
            html_xml = self.get_html(self.PAGE_URL.format(page))
            # Narrow the search to the <li> elements that hold one entry each.
            for li in html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]"):
                qiu_dict = self._parse_item(li)
                print(self.count, qiu_dict)
                self.count += 1

    def _parse_item(self, li):
        """Extract the fields of one entry element into a dict."""
        first = self._first
        # Picture
        pic = self._normalize_pic(
            first(li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src"))
        )
        # Nickname
        nike_name = first(li.xpath(".//span[@class='recmd-name']/text()"))
        # Content
        content = first(li.xpath(".//a[@class='recmd-content']/text()"))
        # Number of "funny" votes
        laught_num = self._parse_count(
            first(li.xpath(".//div[@class='recmd-num']/span[1]/text()"))
        )
        # Number of comments
        comment_num = self._parse_count(
            first(li.xpath(".//div[@class='recmd-num']/span[4]/text()"))
        )
        return {
            "pic": pic,
            "nike_name": nike_name,
            "content": content,
            "laught_num": laught_num,
            "comment_num": comment_num,
        }

    @staticmethod
    def _first(values, default=""):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    @classmethod
    def _normalize_pic(cls, src):
        """Return the absolute picture URL, or "" if *src* lacks the marker."""
        marker = cls.THUMB_MARKER
        if marker not in src:
            return ""
        # src is scheme-relative ("//host/..."), so prepend the scheme.
        return "https:" + src[:-len(marker)]

    @staticmethod
    def _parse_count(text):
        """Convert a counter string such as "123" or "1.2万" to an int.

        Empty or whitespace-only text yields 0.

        Raises:
            ValueError: if the text is not a number.
        """
        text = text.strip()
        if not text:
            return 0
        if "万" in text:
            # "万" means ten thousand; round so float error cannot drop a unit.
            return int(round(float(text.replace("万", "")) * 10000))
        return int(text)

    @staticmethod
    def _parse_max_page(page_texts):
        """Return the last numeric entry of *page_texts*, or 1 if there is none."""
        for text in reversed(page_texts):
            try:
                return int(text.strip())
            except ValueError:
                continue
        return 1
if __name__ == "__main__":
    # Script entry point: build the scraper and call it, which starts the crawl.
    Qiu()()
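# NOTE: stray text copied from the source web page, not part of the program:
# "还没有评论,来说两句吧..." ("No comments yet -- be the first to say something...")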