Scraping novel information with a Python crawler
1. Open the novel's main page (the example below is a novel I picked at random online) and grab the novel's name, author, and description.
2. Get the novel's full chapter list (most importantly, each chapter's link address, the href); a minimal sketch of these first two steps follows this list.
3. Download each chapter's content from its address and parse it.
4. Print the parsed content and write it to a file or a database.
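As a warm-up, here is a minimal sketch of steps 1 and 2 with requests and lxml, reusing the XPath expressions and page structure from the full v1 code below:

import requests
from lxml import html

url = 'https://www.dianxs.com/book/64554/'
headers = {'user-agent': 'Mozilla/5.0'}

# Step 1: fetch the novel's main page and extract its name and author.
page = html.fromstring(requests.get(url, headers=headers, timeout=10).text)
novel = page.xpath('//div[@class="info"]/h1/text()')
author = page.xpath('//div[@class="info"]/p[1]/text()')

# Step 2: collect every chapter link; the hrefs are relative paths,
# so the site host must be prepended before downloading.
hrefs = page.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
print(novel, author, len(hrefs))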
The v1 example code is below (for learning and exchange only):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author : xiaofeng
@Time : 2018/12/26 11:41
@Desc : Less interests,More interest.
@Project : python_appliction
@FileName: dianxs.py
@Software: PyCharm
@Blog :https://blog.csdn.net/zwx19921215
"""
import time

import requests
from lxml import html

"""
Simple crawler for novel content from the '殿行说小说网' novel site.
"""


class Dianxs():

    # Constructor: store the site host, entry URL, request headers and output path.
    def __init__(self, host, url, headers, path):
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path

    def download_page(self):
        """Download and parse the novel's main page and fetch the chapter list."""
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        for href in hrefs:
            time.sleep(1)  # be polite: pause between chapter requests
            address = self.host + href
            self.parse_html(address)

    def parse_html(self, address):
        """Parse a chapter page.

        :param address: chapter page URL
        """
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # Retry on a bad status; return so the failed response is not parsed.
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) turns the list into a string; list(string) goes the other way.
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    def write_to_file(self, title, content):
        """Append one chapter to the output file."""
        # Mode 'a' appends and creates the file if it does not exist,
        # and 'with' closes the file automatically.
        with open(self.path, mode='a', encoding='utf-8') as file:
            file.write(title + '\n')
            file.writelines(content)
            file.write('\n\n')


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    path = 'G:/test/novel.txt'
    app = Dianxs(host, url, headers, path)
    app.download_page()
Console output: (screenshot omitted)
File contents written: (screenshot omitted)
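One weak spot in v1: parse_html retries a failed chapter by calling itself with no limit, so a permanently broken URL recurses until the interpreter gives up. A minimal sketch of a bounded retry with back-off (fetch_with_retry is a hypothetical helper, not part of the original code):

import time
import requests

def fetch_with_retry(url, headers, retries=3, delay=2):
    # Try the request a bounded number of times instead of recursing forever.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print('attempt', attempt + 1, 'failed:', e)
        time.sleep(delay * (attempt + 1))  # back off a little more each time
    return None  # caller decides how to record the failed chapter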
v2: starting from the site's homepage, crawl depth-first through every category in the navigation bar and every novel under each category.
The improved v2 example is below:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author : xiaofeng
@Time : 2018/12/26 11:41
@Desc : Less interests,More interest.
@Project : python_appliction
@FileName: dianxs2.py
@Software: PyCharm
@Blog :https://blog.csdn.net/zwx19921215
"""
import os

import requests
from lxml import html

"""
Simple crawler for novel content from the '殿行说小说网' novel site.
"""


class Dianxs():

    # Constructor: store the site host, entry URL, request headers and output directory.
    def __init__(self, host, url, headers, path):
        self.host = host
        self.url = url
        self.headers = headers
        self.path = path
        self.novel_name = ''

    def nav_page(self):
        """Crawl the category list from the main site's navigation bar."""
        print('------------------殿行说----------------------------')
        response = requests.get(url=self.host, headers=self.headers)
        text = html.fromstring(response.text)
        nav_list = text.xpath('//ul[@class="nav"]/li/a/text()')
        nav_href_list = text.xpath('//ul[@class="nav"]/li/a/@href')
        # Drop the first entry (the link back to the homepage).
        nav_list.pop(0)
        nav_href_list.pop(0)
        print(nav_list)
        print(nav_href_list)
        for i, nav_item in enumerate(nav_href_list):
            address = self.host + nav_item
            self.nav_item(address, nav_list[i])

    def nav_item(self, url, nav_title):
        """Crawl every novel listed under one category."""
        response = requests.get(url=url, headers=self.headers)
        text = html.fromstring(response.text)
        novel_list = text.xpath(
            '//div[@class="panel new-xs-list w300 w265 fr simple"]/ul/li//span[@class="xs-name"]/a/text()')
        novel_list_href = text.xpath(
            '//div[@class="panel new-xs-list w300 w265 fr simple"]/ul/li//span[@class="xs-name"]/a/@href')
        print('--------------------', nav_title, '-----------------')
        print(novel_list)
        print(novel_list_href)
        print('\n')
        for nov_item in novel_list_href:
            self.url = self.host + nov_item
            self.download_page()

    def download_page(self):
        """Download and parse one novel's main page, then crawl all of its chapters."""
        response = requests.get(url=self.url, headers=self.headers)
        text = html.fromstring(response.text)
        novel = text.xpath('//div[@class="info"]/h1/text()')
        author = text.xpath('//div[@class="info"]/p[1]/text()')
        act = text.xpath('//div[@class="info"]/p[2]/text()')
        last_update = text.xpath('//div[@class="info"]/p[3]/text()')
        last_chapter_href = text.xpath('//div[@class="info"]/p[4]/a/@href')
        last_chapter_title = text.xpath('//div[@class="info"]/p[4]/a/text()')
        introduce = text.xpath('//div[@class="info"]/p[@class="introduce"]/text()')
        print('---------------------------description--------------------------------')
        print(novel)
        # author[0].replace('\xa0', '')
        print(author)
        print(act)
        print(last_update)
        print(last_chapter_title, ' , ', last_chapter_href)
        print('Introduction:', introduce)
        print('-----------------------------------------------------------------------')
        print('\n')
        chapters = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/text()')
        hrefs = text.xpath('//div[@class="section-panel section-list"]/dl/dd/a/@href')
        print(chapters)
        print(hrefs)
        print('\n')
        self.novel_name = novel  # remember the name for the output directory
        for href in hrefs:
            # time.sleep(1)
            address = self.host + href
            self.parse_html(address)

    def parse_html(self, address):
        """Parse a chapter page.

        :param address: chapter page URL
        """
        response = requests.get(url=address, headers=self.headers, timeout=10)
        if response.status_code != 200:
            # Retry on a bad status; return so the failed response is not parsed.
            self.parse_html(address)
            return
        text = html.fromstring(response.text)
        title = text.xpath('//div[@class="read-title"]/h2/text()')
        content = text.xpath('//div[@class="read-content"]/p/text()')
        print('-------- ', title, '-----------')
        print(content)
        print('\n')
        # ''.join(content) turns the list into a string; list(string) goes the other way.
        title_str = ''.join(title)
        content_str = ''.join(content)
        self.write_to_file(title_str, content_str)

    def write_to_file(self, title, content):
        """Write one chapter to its own file under the novel's directory."""
        file_path = self.path + ''.join(self.novel_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        file_name = file_path + '/' + title + '.txt'
        try:
            # Mode 'a' appends and creates the file if needed;
            # 'with' closes it automatically.
            with open(file_name, mode='a', encoding='utf-8') as file:
                file.write(title + '\n')
                file.writelines(content)
                file.write('\n\n')
        except OSError as e:
            # Chapter titles can contain characters that are illegal in file
            # names; collect such failures here for later handling.
            # TODO
            print(e)


if __name__ == '__main__':
    host = 'https://www.dianxs.com'
    url = 'https://www.dianxs.com/book/64554/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    path = 'G:/殿兴说/'
    app = Dianxs(host, url, headers, path)
    app.nav_page()
    # app.download_page()
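v2 comments out the time.sleep(1) delay to speed up the full-site crawl, but that also hammers the server. A small sketch of a politer alternative, reusing one requests.Session for connection pooling plus a fixed delay (polite_get is a hypothetical helper, not part of the original code):

import time
import requests

session = requests.Session()  # reuse one connection pool across requests
session.headers.update({'user-agent': 'Mozilla/5.0'})

def polite_get(url, delay=0.5):
    # One shared session plus a fixed pause between requests.
    time.sleep(delay)
    return session.get(url, timeout=10)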