爬虫:爬取糗事百科数据

青旅半醒 2022-01-26 04:45 367阅读 0赞

在这里插入图片描述

  1. import requests
  2. from lxml import etree
  3. from fake_useragent import UserAgent
  4. class Qiu:
  5. def __init__(self):
  6. self.count = 1
  7. def __call__(self, *args, **kwargs):
  8. self.get_max_page()
  9. def get_html(self, base_url):
  10. # 随机产生一个浏览器信息
  11. headers = {"User-Agent": UserAgent().random}
  12. response = requests.get(base_url, headers=headers)
  13. html = response.text
  14. # print(html)
  15. html_xml = etree.HTML(html)
  16. return html_xml
  17. def get_max_page(self):
  18. base_url = "https://www.qiushibaike.com/8hr/page/2/"
  19. html_xml = self.get_html(base_url)
  20. # 获取最大页码
  21. max_page = int(html_xml.xpath("//a/span[@class='page-numbers']/text()")[-1].strip())
  22. # print(max_page)
  23. self.get_data(max_page)
  24. def get_data(self, max_page):
  25. for page in range(1, max_page + 1):
  26. print("===================第{}页开始下载=========================".format(page))
  27. page_url = "https://www.qiushibaike.com/8hr/page/{}/".format(page)
  28. # print(page_url)
  29. html_xml = self.get_html(page_url)
  30. # 缩小范围
  31. li_list = html_xml.xpath("//li[contains(@id, 'qiushi_tag_')]")
  32. # print(len(li_list))
  33. for li in li_list:
  34. # 获取图片
  35. pic = li.xpath(".//a[contains(@class, 'recmd-left')]/img/@src")[0]
  36. # if "/w/150/h/112" in pic:
  37. # pic = "https:" + pic[:-12]
  38. # else:
  39. # pic = ""
  40. # 三元表达式 实现上面的代码
  41. pic = "https:" + pic[:-12] if "/w/150/h/112" in pic else ""
  42. # print(pic)
  43. # 获取昵称
  44. nike_name = li.xpath(".//span[@class='recmd-name']/text()")[0]
  45. # print(nike_name)
  46. # 获取内容
  47. content = li.xpath(".//a[@class='recmd-content']/text()")
  48. content = content[0] if content else ""
  49. # print(content)
  50. # 获取好笑数量
  51. laught_num = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0]
  52. # if "万" in laught_num:
  53. # laught_num = int(float(laught_num[:-1]) * 10000)
  54. # else:
  55. # laught_num = int(laught_num)
  56. laught_num = int(float(laught_num[:-1]) * 10000) if "万" in laught_num else int(laught_num)
  57. # print(laught_num)
  58. # 评论数量
  59. comment_num = li.xpath(".//div[@class='recmd-num']/span[4]/text()")
  60. comment_num = int(comment_num[0]) if comment_num else 0
  61. # print(comment_num)
  62. qiu_dict = {
  63. "pic": pic,
  64. "nike_name": nike_name,
  65. "content": content,
  66. "laught_num": laught_num,
  67. "comment_num": comment_num,
  68. }
  69. print(self.count, qiu_dict)
  70. self.count += 1
  71. if __name__ == '__main__':
  72. qiu = Qiu()
  73. qiu()

发表评论

表情:
评论列表 (有 0 条评论,367人围观)

还没有评论,来说两句吧...

相关阅读

    相关 百科爬虫

    这几天看了不少 python 的基础,试着做了一个 demo 但不是很成功,不知道是家里网络不太好还是正则匹配得不好,re.findall 的数据不是特别的稳定,有时候要加载很长