# Crawling Taobao Comments with Scrapy and Selenium+PhantomJS #

柔情只为你懂 2022-06-07 13:09

## Crawling Product IDs with Scrapy ##

First, set `ROBOTSTXT_OBEY = False`; otherwise Scrapy will refuse to crawl these pages.
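For reference, that switch lives in the project's `settings.py`; a minimal excerpt (this is a standard Scrapy setting, with everything else left at the project defaults):

```python
# settings.py (excerpt)
# Taobao's robots.txt disallows these pages, so the robots.txt check
# has to be switched off for the spiders below to run at all.
ROBOTSTXT_OBEY = False
```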
`base.py`: a shared base class; each spider writes its results to a text file named after itself.

```python
# -*- coding: utf-8 -*-
import scrapy
import codecs


class BaseSpider(scrapy.Spider):
    allowed_domains = ["taobao.com"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Each spider dumps its results into "<spider name>.txt"
        self.file = codecs.open(self.name + '.txt', 'w', 'utf-8')

    def __del__(self):
        self.file.close()
```

`tce_id.py` crawls the sub-category IDs (the `tce_sid`/`tce_vid` pairs embedded in each market page).

```python
# -*- coding: utf-8 -*-
from .base import BaseSpider
import json

CATEGORY_URLS = [
    'https://www.taobao.com/markets/nvzhuang/taobaonvzhuang',
    'https://www.taobao.com/markets/nanzhuang/2017new',
    'https://neiyi.taobao.com',
    'https://www.taobao.com/markets/xie/nvxie/index',
    'https://www.taobao.com/markets/bao/xiangbao',
    'https://pei.taobao.com',
    'https://www.taobao.com/markets/qbb/index?spm=a21bo.50862.201879-item-1008.5.YrbXb6&pvid=b9f2df4c-6d60-4af4-b500-c5168009831f&scm=1007.12802.34660.100200300000000',
    'https://www.taobao.com/markets/qbb/index?spm=a21bo.50862.201867-main.8.mL7cax&pvid=b9f2df4c-6d60-4af4-b500-c5168009831f&scm=1007.12802.34660.100200300000000',
    'https://www.taobao.com/markets/qbb/index?spm=a21bo.50862.201867-main.8&pvid=b9f2df4c-6d60-4af4-b500-c5168009831f&scm=1007.12802.34660.100200300000000',
    'https://www.taobao.com/markets/jiadian/index',
    'https://www.taobao.com/markets/3c/shuma',
    'https://www.taobao.com/markets/3c/sj',
    'https://mei.taobao.com/',
    'https://www.taobao.com/market/baihuo/xihuyongpin.php?spm=a217u.7383845.a214d5z-static.49.e8DQmz',
    'https://g.taobao.com/brand_detail.htm?navigator=all&_input_charset=utf-8&q=%E8%90%A5%E5%85%BB%E5%93%81&spm=a21bo.50862.201867-links-4.54.oMw9IU',
    'https://www.taobao.com/market/peishi/zhubao.php',
    'https://www.taobao.com/market/peishi/yanjing.php?spm=a219r.lm5630.a214d69.14.CkLAJ7',
    'https://www.taobao.com/market/peishi/shoubiao.php',
    'https://www.taobao.com/markets/coolcity/coolcityHome',
    'https://www.taobao.com/markets/coolcity/coolcityHome',
    'https://www.taobao.com/markets/amusement/home',
    'https://game.taobao.com',
    'https://www.taobao.com/markets/acg/dongman',
    'https://www.taobao.com/markets/acg/yingshi',
    'https://chi.taobao.com',
    'https://chi.taobao.com',
    'https://chi.taobao.com',
    'https://s.taobao.com/search?q=%E5%9B%AD%E8%89%BA&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170419',
    'https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20170419&stats_click=search_radio_all%3A1&js=1&imgfile=&q=%E8%BF%9B%E5%8F%A3%E7%8B%97%E7%B2%AE&suggest=history_3&_input_charset=utf-8&wq=&suggest_query=&source=suggest',
    'https://s.taobao.com/search?q=%E5%86%9C%E8%B5%84&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170221',
    'https://fang.taobao.com/',
    'https://s.taobao.com/list?spm=a21bo.50862.201867-links-10.27.iQWRJS&source=youjia&cat=50097129',
    'https://www.jiyoujia.com/markets/youjia/zhuangxiucailiao',
    'https://s.taobao.com/list?spm=a21bo.7932212.202572.1.rtUtMQ&source=youjia&q=%E5%AE%B6%E5%85%B7',
    'https://s.taobao.com/list?source=youjia&cat=50065206%2C50065205',
    'https://s.taobao.com/list?spm=a21bo.50862.201867-links-11.80.K6jN68&source=youjia&cat=50008163&bcoffset=0&s=240',
    'https://car.tmall.com/wow/car/act/carfp',
    'https://2car.taobao.com/',
    'https://car.tmall.com/wow/car/act/carfp',
    'https://www.taobao.com/markets/bangong/pchome',
    'https://www.taobao.com/markets/dingzhi/home',
    'https://wujin.taobao.com/',
    'https://s.taobao.com/list?source=youjia&q=%E7%99%BE%E8%B4%A7',
    'https://s.taobao.com/list?source=youjia&cat=50035867&bcoffset=0&s=240',
    'https://www.taobao.com/market/jiadian/baojian.php?spm=a21bo.50862.201867-main.46.K6jN68',
    'https://xue.taobao.com',
    'https://ka.taobao.com/',
    'https://s.taobao.com/list?q=%E4%B8%8A%E9%97%A8%E6%9C%8D%E5%8A%A1&cat=50097750'
]


class TceIdSpider(BaseSpider):
    name = "TceId"
    start_urls = CATEGORY_URLS

    def parse(self, response):
        # Each page module stores its config as JSON in a tms-data attribute
        data = response.xpath('//div[@tms-data]/@tms-data').extract()
        data = [json.loads(cur_data) for cur_data in data]
        tce_ids = []
        for cur_data in data:
            for key in cur_data:
                if not key.startswith('items'):
                    continue
                for item in cur_data[key]:
                    # Only modules loaded via jsonp carry a tce_sid/tce_vid pair
                    if not ('tms_type' in item and item['tms_type'] == 'jsonp'):
                        continue
                    tce_ids.append([
                        str(item['data_para']['tce_sid']),
                        item['data_para']['tce_vid']
                    ])
        if not tce_ids:
            self.logger.warning('No tce_id on "{}"'.format(response.url))
        else:
            for tce_id in tce_ids:
                json.dump(tce_id, self.file)
                self.file.write('\n')
```

`item_id.py` crawls the product IDs.

```python
# -*- coding: utf-8 -*-
from .base import BaseSpider
import codecs
import json


def gen_start_urls():
    tce_sids = []
    tce_vids = []
    with codecs.open('TceId.txt', 'r', 'utf-8') as file:
        for line in file:
            data = json.loads(line)
            tce_sids.append(data[0])
            tce_vids.append(data[1])
    size = len(tce_sids)
    for start in range(0, size, 20):  # query up to 20 IDs per request
        end = min(start + 20, size)
        url = ('https://tce.taobao.com/api/mget.htm?callback=jsonp123'
               '&tce_sid={0}&tce_vid={1}'
               '&tid={2}&tab={2}&topic={2}&count={2}').format(
            ','.join(tce_sids[start:end]),
            ','.join(tce_vids[start:end]),
            ',' * (end - start)
        )
        yield url


class ItemIdSpider(BaseSpider):
    name = "ItemId"
    start_urls = gen_start_urls()

    def parse(self, response):
        # Strip the jsonp wrapper: keep everything between the outermost braces
        data = response.text[response.text.find('{') : response.text.rfind('}') + 1]
        data = json.loads(data)
        # if 'result' not in data:
        #     return
        for tce in data['result'].values():
            for item in tce['result']:
                if 'auction_id' not in item or item['auction_id'] == '0':
                    continue
                self.file.write(item['auction_id'])
                self.file.write('\n')
```
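The `mget.htm` endpoint answers with jsonp (`jsonp123({...});`), which is why `parse` slices between the first `{` and the last `}`. The same idea as a standalone helper, a minimal sketch you could reuse for other jsonp endpoints (the `strip_jsonp` name is mine, not part of the original code):

```python
import json


def strip_jsonp(text):
    """Extract the JSON payload from a jsonp response such as
    ``jsonp123({...});`` by slicing between the outermost braces;
    this mirrors what ItemIdSpider.parse does inline."""
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end == -1:
        raise ValueError('no JSON object in jsonp response')
    return json.loads(text[start:end + 1])


# e.g. strip_jsonp('jsonp123({"result": {}});') == {'result': {}}
```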
## Crawling the Comment List with Selenium+PhantomJS ##

I originally planned to use Scrapy for this part as well, but the comment URL carries a `ua` (User Action) parameter that I couldn't figure out how to compute, so I simply drove a browser with Selenium instead, while still imitating the structure of a Scrapy spider.

Why not scrapy-splash? Because I could not get Splash installed here no matter what I tried, so I gave up on it…

One trick: I override the jsonp callback to capture the returned data directly. This yields the most complete data, and I never have to care about the DOM structure.

In my tests I was not blocked even without proxy IPs or `sleep` calls; Taobao probably just checks that the number of concurrent requests isn't too high. I don't know whether this holds for everyone.

```python
# -*- coding: utf-8 -*-
import codecs
from selenium import webdriver
from selenium.common.exceptions import *
import re
import math
import json


def gen_start_urls():
    with codecs.open('ItemId.txt', 'r', 'utf-8') as file:
        for line in file:
            yield 'https://item.taobao.com/item.htm?id=' + line.strip()


class CommentSpider():
    name = "Comment"
    start_urls = gen_start_urls()

    def start_requests(self):
        # Create the driver outside the try block so that a failed start
        # doesn't leave `driver` unbound in the finally clause
        driver = webdriver.PhantomJS(
            executable_path=r'F:\WebDriver\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        try:
            driver.implicitly_wait(10)
            for url in self.start_urls:
                print('Crawling "{}"'.format(url))
                driver.get(url)
                self.parse_item(driver)
            return []
        finally:
            driver.close()

    def parse_item(self, driver):
        try:
            if '//item.taobao.com/item.htm' not in driver.current_url:
                print('Unknown page')
                return
            comment_count = int(
                driver.find_element_by_class_name('J_ReviewsCount').text)
            if comment_count == 0:
                print('No comment')
                return
            # Taobao shows 20 comments per page
            print('Estimated max pages', math.ceil(comment_count / 20))
        except Exception as e:
            print(e)
        try:
            # Hook the jsonp callback: stash every response in a JS global
            driver.execute_script('jsonp_tbcrate_reviews_list = '
                                  'function(data){ comment_data = data }')
            # Click the tab that displays the comments
            driver.find_element_by_css_selector(
                'a.tb-tab-anchor[data-index="1"]').click()
            revbd_elem = driver.find_element_by_class_name('tb-revbd')
        except Exception as e:
            print(e)
            return
        with codecs.open(self.get_filename(driver), 'w', 'utf-8') as file:
            page = 0
            while True:
                page += 1
                print('Page', page)
                if not self.parse_comments(driver, revbd_elem, file):
                    break
                if not self.go_to_next_page(revbd_elem):
                    break

    def get_filename(self, driver):
        # Map characters that are illegal in Windows filenames to their
        # fullwidth equivalents; the Comments/ directory must already exist
        REPLACE = {
            '\\': '\',
            '/': '/',
            ':': ':',
            '*': '*',
            '?': '?',
            '"': "'",
            '<': '<',
            '>': '>',
            '|': '|'
        }
        title = driver.title
        for k, v in REPLACE.items():
            title = title.replace(k, v)
        return 'Comments/{} {}.txt'.format(
            re.findall(r'[\?&]id=(\d+)', driver.current_url)[0],
            title
        )

    def parse_comments(self, driver, revbd_elem, file):
        try:
            # The implicit wait makes this lookup block until the request ends
            comment_elems = revbd_elem.find_elements_by_class_name(
                'J_KgRate_ReviewItem')
            if not comment_elems:
                print('Anti spider!')
                return False
            # Fetch the jsonp response captured by the hooked callback
            comment_data = driver.execute_script('return comment_data')
            json.dump(comment_data, file)
            file.write('\n')
        except Exception as e:
            print(e)
        return True

    def go_to_next_page(self, revbd_elem):
        try:
            next_elem = revbd_elem.find_element_by_class_name('pg-next')
            if 'pg-disabled' in next_elem.get_attribute('class'):
                return False
            next_elem.click()
        except NoSuchElementException:
            # Only 1 page of comments
            return False
        except Exception as e:
            print(e)
            return False
        return True


if __name__ == '__main__':
    spider = CommentSpider()
    spider.start_requests()
```

## Results ##

A comment crawled at random looks like this:

![Result][SouthEast]

[SouthEast]: /images/20220607/140f9d8565b44b45a6c64ff4c556cde6.png
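Each file under `Comments/` ends up holding one captured jsonp payload per comment page, one JSON object per line. As a minimal sketch, the dumps can be read back like this (the `load_comment_pages` helper is only illustrative; what each payload contains depends on Taobao's response, so inspect the keys before relying on them):

```python
# -*- coding: utf-8 -*-
import codecs
import glob
import json


def load_comment_pages():
    """Yield (filename, payload) for every page dumped by CommentSpider.

    Assumes the layout used above: Comments/<item id> <title>.txt with
    one jsonp payload (a JSON object) per line.
    """
    for path in glob.glob('Comments/*.txt'):
        with codecs.open(path, 'r', 'utf-8') as file:
            for line in file:
                line = line.strip()
                if line:
                    yield path, json.loads(line)


if __name__ == '__main__':
    for path, page_data in load_comment_pages():
        # page_data is whatever jsonp_tbcrate_reviews_list received
        print(path, type(page_data))
```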