Python crawler for scraping novel information

桃扇骨 2022-04-02 11:42

1. Open the novel's home page (the example below is just a novel I picked at random online) and grab its name, author, and description.

2. Get the novel's full chapter list (most importantly, the href link of every chapter).

3. Download and parse each chapter's content from its link address.

4. Print the parsed content and write it to a file or a database (a small database sketch follows this list).
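Step 4 mentions writing to a database, but the code in this post only writes to text files. As a minimal sketch of the database option, using the standard-library sqlite3 module (the database file, table, and column names here are illustrative assumptions, not part of the original code):

    import sqlite3

    # Sketch of the "write to a database" option from step 4.
    conn = sqlite3.connect('novel.db')
    conn.execute(
        'CREATE TABLE IF NOT EXISTS chapter ('
        'id INTEGER PRIMARY KEY AUTOINCREMENT, '
        'novel TEXT, title TEXT, content TEXT)'
    )

    def save_chapter(novel, title, content):
        # Insert one parsed chapter; parameters are bound to avoid SQL injection.
        conn.execute(
            'INSERT INTO chapter (novel, title, content) VALUES (?, ?, ?)',
            (novel, title, content)
        )
        conn.commit()

    # Example: save_chapter('some novel', 'Chapter 1', 'chapter text ...')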

Sample code, v1 (for learning and exchange only):

# !/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author  : xiaofeng
@Time    : 2018/12/26 11:41
@Desc    : Less interests, More interest.
@Project : python_appliction
@FileName: dianxs.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import os
import time

import requests
from lxml import html

"""
A simple crawler for novel chapters from '殿行说小说网' (dianxs.com).
"""


class Dianxs():

    def __init__(self, host, url, headers, path):
        # Constructor: site host, novel home page URL, request headers, output file path
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path

    def download_page(self):
        """Download and parse the novel's home page: description plus the chapter list."""
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            time.sleep(1)
            address = self.host + href
            self.parse_html(address)

    def parse_html(self, address):
        """Parse one chapter page.

        :param address: chapter page URL
        """
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # naive retry on a bad status code (note: unbounded recursion)
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) turns the list into a string; list(string) does the reverse
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    def write_to_file(self, title, content):
        """Append one chapter to the output file."""
        flag = os.path.exists(self.path)
        if not flag:
            # 'w' mode creates the file if it does not exist yet
            f = open(self.path, 'w')
            f.close()
        # 'with' closes the file automatically; 'a' appends to it
        with open(self.path, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    path = 'G:/test/novel.txt'
    app = Dianxs(host, url, headers, path)
    app.download_page()
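One note on the v1 code: parse_html retries a failed request by calling itself, which can recurse without bound if the site keeps returning errors. A more robust alternative (my own suggestion, not part of the original post) is to let requests retry at the transport level with a Session and an HTTPAdapter, roughly like this:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # Sketch: a session that retries transient failures with backoff,
    # so parse_html no longer needs to call itself on a bad status code.
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    session.mount('http://', HTTPAdapter(max_retries=retry))

    # Use session.get(...) in place of requests.get(...) inside the class.
    response = session.get('https://www.dianxs.com/book/64554/', timeout=10)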

Console output: (screenshots)

File contents written: (screenshot)

v2: starting from the site's home page, depth-first crawl every category in the navigation bar and every novel listed under each category.

The improved v2 example is shown below:

# !/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author  : xiaofeng
@Time    : 2018/12/26 11:41
@Desc    : Less interests, More interest.
@Project : python_appliction
@FileName: dianxs2.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import os

import requests
from lxml import html

"""
A simple crawler for novel chapters from '殿行说小说网' (dianxs.com).
"""


class Dianxs():

    def __init__(self, host, url, headers, path):
        # Constructor: site host, entry URL, request headers, output directory
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path
        self.novel_name = ''

    def nav_page(self):
        """Crawl the category list from the site's navigation bar."""
        print('------------------殿行说----------------------------')
        response = requests.get(url=self.host, headers=self.headers)
        text = html.fromstring(response.text)
        nav_list = text.xpath('//ul[@class="nav"]/li/a/text()')
        nav_href_list = text.xpath('//ul[@class="nav"]/li/a/@href')
        # drop the first entry (the link back to the home page)
        nav_list.pop(0)
        nav_href_list.pop(0)
        print(nav_list)
        print(nav_href_list)
        i = 0
        for nav_item in nav_href_list:
            address = self.host + nav_item
            nav_title = nav_list[i]
            self.nav_item(address, nav_title)
            i += 1

    def nav_item(self, url, nav_title):
        """Crawl all novels listed under one category."""
        response = requests.get(url=url, headers=self.headers)
        text = html.fromstring(response.text)
        novel_list = text.xpath(
            '//div[@class="panel new-xs-list w300 w265 fr simple"]/ul/li//span[@class="xs-name"]/a/text()')
        novel_list_href = text.xpath(
            '//div[@class="panel new-xs-list w300 w265 fr simple"]/ul/li//span[@class="xs-name"]/a/@href')
        print('--------------------', nav_title, '-----------------')
        print(novel_list)
        print(novel_list_href)
        print('\n')
        for nov_item in novel_list_href:
            self.url = self.host + nov_item
            self.download_page()

    def download_page(self):
        """Download and parse one novel's home page, then crawl every chapter."""
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            # time.sleep(1)
            address = self.host + href
            self.novel_name = novel
            self.parse_html(address)

    def parse_html(self, address):
        """Parse one chapter page.

        :param address: chapter page URL
        """
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # naive retry on a bad status code (note: unbounded recursion)
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) turns the list into a string; list(string) does the reverse
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    def write_to_file(self, title, content):
        """Write one chapter to its own file under a per-novel directory."""
        file_path = self.path + ''.join(self.novel_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        file_name = file_path + '/' + title + '.txt'
        flag = os.path.exists(file_name)
        if not flag:
            # 'w' mode creates the file if it does not exist yet
            try:
                f = open(file_name, 'w')
                f.close()
            except Exception as e:
                print(e)
                # chapters that fail here could be collected for later reprocessing
                # todo
        # 'with' closes the file automatically; 'a' appends to it
        with open(file_name, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    path = 'G:/殿兴说/'
    app = Dianxs(host, url, headers, path)
    app.nav_page()
    # app.download_page()
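One caveat about v2's write_to_file: the chapter title is used directly as the file name, and characters such as '?' or ':' in a title will make open() fail on Windows. A small helper along these lines (my own addition, not in the original post) could strip those characters before building the path:

    import re

    def safe_file_name(title):
        # Replace characters that are not allowed in Windows file names
        # (\ / : * ? " < > |) with an underscore before using the chapter
        # title as a file name.
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

    # file_name = file_path + '/' + safe_file_name(title) + '.txt'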
