scrapy爬取链接后再爬取链接内容

╰+攻爆jí腚メ 2022-05-15 15:10 410阅读 0赞

以下代码已在 Python 3.6 环境下测试通过。

#!/usr/bin/python 
    # -*- coding:utf-8 -*- 
    from scrapy.http import Request
    from scrapy.selector import Selector
    from scrapy.spiders import Spider

    # NOTE(review): the spider instantiates StorageItem; W3SchoolItem is unused in
    # this snippet and is not defined in the items.py shown in this article --
    # it looks like a leftover, remove it if storage.items does not export it.
    from storage.items import StorageItem
    from storage.items import W3SchoolItem
    
    class StorageSpider(Spider):
        """Crawl warehouse listings and then the pages those listings link to.

        A Scrapy spider needs three members: ``name``, ``start_urls`` and
        ``parse()``.  ``parse`` handles the start page and is also registered as
        the callback for every followed link, so one response may yield an
        item, further requests, or both.
        """

        name = "storage"  # identifier of this spider
        allowed_domains = ["www.zyc56.org.cn"]  # domain restriction
        start_urls = [  # the spider starts crawling from these pages
            "http://www.zyc56.org.cn/index.php?m=content&c=index&a=lists&catid=31"
        ]

        def parse(self, response):
            """Extract one warehouse item from *response* and follow its links.

            Yields:
                A ``StorageItem`` when the page contains a company name, then a
                ``Request`` (handled by this same method) for every link found
                under ``<dd class="intro">``.
            """
            sel = Selector(response)
            item = StorageItem()

            main_block = sel.xpath('//div[@class="map_intro clear"]')
            article_block = sel.xpath('//div[@class="map_article"]')

            item['crawlUrl'] = response.url
            item['enterpriseName'] = main_block.xpath('dl/dd[1]/text()').extract()  # company name
            item['contactUser'] = main_block.xpath('dl/dd[2]/text()').extract()  # contact person
            item['contactNumber'] = main_block.xpath('dl/dd[3]/b/text()').extract()  # contact phone
            item['warehouseType'] = main_block.xpath('dl/dd[4]/text()').extract()  # warehouse type
            item['releaseTime'] = main_block.xpath('dl/dt/span/text()').extract()  # publish time

            item['warehouseAddress'] = article_block.xpath('div/span/text()').extract()  # region
            item['warehouseDetailAddr'] = article_block.xpath('div/text()[2]').extract()  # detailed address

            # Preferred source: the cell right after the one labelled "仓库规模".
            size_cells = article_block.xpath('table/tbody/tr/td[contains(text(),"仓库规模")]/following-sibling::td[position()=1]')
            if not size_cells:
                # Fallback layout: second cell of the rows following the
                # "仓库建设方案" row.
                size_cells = article_block.xpath('table/tbody/tr/td[contains(text(),"仓库建设方案")]/../following-sibling::tr/td[position()=2]')

            # translate() drops every non-breaking space and every "平" / "米"
            # character; normalize-space() then collapses the remaining whitespace.
            item['warehouseSize'] = size_cells.xpath('normalize-space(translate(translate(string(.),"\xa0",""),"平米",""))').extract()

            # Only pages that actually carry a company name produce an item.
            if item['enterpriseName']:
                yield item

            for href in sel.xpath('//dd[@class="intro"]/a/@href').extract():
                # Request() rejects URLs without a scheme, so resolve relative
                # hrefs against the current page first (absolute ones pass
                # through unchanged).
                yield Request(url=response.urljoin(href), callback=self.parse)

`pipelines.py` 文件代码如下：

# -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    #from scrapy.exporters import JsonItemExporter
    import pymysql
    
    class StoragePipeline(object):
        """Item pipeline that inserts each scraped item into ``storage_info``.

        One MySQL connection is opened in ``__init__`` and reused for every
        item; it is closed in ``close_spider``.
        """

        # Alternative implementation, kept for reference: export the items to a
        # JSON file instead of MySQL (requires
        # ``from scrapy.exporters import JsonItemExporter``).
        #
        # def open_spider(self, spider):
        #     # Optional hook, called when the spider is opened.
        #     # Writes to the file w3school_data_utf8.json.
        #     self.file = open('w3school_data_utf8.json', 'wb')
        #     self.exporter = JsonItemExporter(self.file, encoding='utf-8')
        #     self.exporter.start_exporting()
        #
        # def close_spider(self, spider):
        #     # Optional hook, called when the spider is closed.
        #     self.exporter.finish_exporting()
        #     self.file.close()
        #
        # def process_item(self, item, spider):
        #     self.exporter.export_item(item)
        #     return item

        def __init__(self):
            """Open the MySQL connection shared by all ``process_item`` calls."""
            self.dbpool = pymysql.connect(
                host = '127.0.0.1',
                db = 'db_scrapy',
                user = 'root',
                passwd = 'abc123',
                charset = 'utf8'
            )

        def process_item(self, item, spider):
            """Insert *item* into ``storage_info`` and return it unchanged.

            Every list-valued field contributes its first element; the slices
            drop a fixed-length leading prefix from that text before storing
            it.  Any failure (including a missing field value) is printed and
            rolled back; the item is still returned so later pipelines run.
            """
            db = self.dbpool
            cur = db.cursor()
            try:
                cur.execute("insert into storage_info(enterprise_name, warehouse_address, warehouse_detail_addr, warehouse_size,warehouse_type, contact_user, contact_number, release_time, add_type, crawl_url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (
                    item['enterpriseName'][0][5:],
                    item['warehouseAddress'][0],
                    item['warehouseDetailAddr'][0].strip()[5:],
                    item['warehouseSize'][0].strip(),
                    item['warehouseType'][0][5:],
                    item['contactUser'][0][4:],
                    item['contactNumber'][0],
                    item['releaseTime'][0][3:],
                    1,  # add_type is always written as the constant 1
                    item['crawlUrl']
                    )
                )
                db.commit()
            except Exception as e:
                print('错误',format(e))
                # Roll back only.  The connection is shared by all items, so
                # closing it here would make every later insert fail.
                db.rollback()
            finally:
                cur.close()

            return item

        def close_spider(self, spider):
            """Close the MySQL connection once the spider has finished."""
            self.dbpool.close()

`items.py` 文件代码如下：

# -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    class StorageItem(scrapy.Item):
        """Container for the data scraped from one warehouse page.

        The spider in this article fills every field except ``crawlUrl`` with
        the list returned by ``.extract()``; ``crawlUrl`` is a plain string.
        """
        enterpriseName = scrapy.Field()  # company name
        warehouseAddress = scrapy.Field()  # region the warehouse is in
        warehouseDetailAddr = scrapy.Field()  # detailed address
        warehouseSize = scrapy.Field()  # warehouse size text
        warehouseType = scrapy.Field()  # warehouse type
        releaseTime = scrapy.Field()  # publish time
        contactUser = scrapy.Field()  # contact person
        contactNumber = scrapy.Field()  # contact phone number
        # NOTE(review): never set by the spider shown in this article; the
        # pipeline writes a constant 1 to the add_type column instead.
        addType = scrapy.Field()
        crawlUrl = scrapy.Field()  # URL of the page the item was scraped from

`settings.py` 文件需修改如下配置：

# Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = { 
        # Enable the MySQL pipeline. Per the Scrapy docs the integer sets the
        # order among enabled pipelines: lower values run first.
        'storage.pipelines.StoragePipeline': 300,
    }

[pipelines.py]: http://pipelines.py
[items.py]: http://items.py
[settings.py]: http://settings.py