# Crawling Taobao Comments with Scrapy and Selenium+PhantomJS #

柔情只为你懂 2022-06-07 13:09

## Crawling Product IDs with Scrapy ##

First, set `ROBOTSTXT_OBEY = False`; otherwise Scrapy will refuse to crawl these pages.
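For reference, that switch lives in the project's `settings.py`; a minimal excerpt (this is a standard Scrapy setting, with everything else left at the project defaults):

```python
# settings.py (excerpt)
# Taobao's robots.txt disallows these pages, so the robots.txt check
# has to be switched off for the spiders below to run at all.
ROBOTSTXT_OBEY = False
```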
`base.py`: a shared base class; each spider writes its results to a text file named after itself.

```python
# -*- coding: utf-8 -*-
import scrapy
import codecs


class BaseSpider(scrapy.Spider):
    allowed_domains = ["taobao.com"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Each spider dumps its results into "<spider name>.txt"
        self.file = codecs.open(self.name + '.txt', 'w', 'utf-8')

    def __del__(self):
        self.file.close()
```

`tce_id.py` crawls the sub-category IDs (the `tce_sid`/`tce_vid` pairs embedded in each market page).

```python
# -*- coding: utf-8 -*-
from .base import BaseSpider
import json

CATEGORY_URLS = [
    'https://www.taobao.com/markets/nvzhuang/taobaonvzhuang',
    'https://www.taobao.com/markets/nanzhuang/2017new',
    'https://neiyi.taobao.com',
    'https://www.taobao.com/markets/xie/nvxie/index',
    'https://www.taobao.com/markets/bao/xiangbao',
    'https://pei.taobao.com',
    'https://www.taobao.com/markets/qbb/index?spm=a21bo.50862.201879-item-1008.5.YrbXb6&pvid=b9f2df4c-6d60-4af4-b500-c5168009831f&scm=1007.12802.34660.100200300000000',
    'https://www.taobao.com/markets/qbb/index?spm=a21bo.50862.201867-main.8.mL7cax&pvid=b9f2df4c-6d60-4af4-b500-c5168009831f&scm=1007.12802.34660.100200300000000',
    'https://www.taobao.com/markets/qbb/index?spm=a21bo.50862.201867-main.8&pvid=b9f2df4c-6d60-4af4-b500-c5168009831f&scm=1007.12802.34660.100200300000000',
    'https://www.taobao.com/markets/jiadian/index',
    'https://www.taobao.com/markets/3c/shuma',
    'https://www.taobao.com/markets/3c/sj',
    'https://mei.taobao.com/',
    'https://www.taobao.com/market/baihuo/xihuyongpin.php?spm=a217u.7383845.a214d5z-static.49.e8DQmz',
    'https://g.taobao.com/brand_detail.htm?navigator=all&_input_charset=utf-8&q=%E8%90%A5%E5%85%BB%E5%93%81&spm=a21bo.50862.201867-links-4.54.oMw9IU',
    'https://www.taobao.com/market/peishi/zhubao.php',
    'https://www.taobao.com/market/peishi/yanjing.php?spm=a219r.lm5630.a214d69.14.CkLAJ7',
    'https://www.taobao.com/market/peishi/shoubiao.php',
    'https://www.taobao.com/markets/coolcity/coolcityHome',
    'https://www.taobao.com/markets/coolcity/coolcityHome',
    'https://www.taobao.com/markets/amusement/home',
    'https://game.taobao.com',
    'https://www.taobao.com/markets/acg/dongman',
    'https://www.taobao.com/markets/acg/yingshi',
    'https://chi.taobao.com',
    'https://chi.taobao.com',
    'https://chi.taobao.com',
    'https://s.taobao.com/search?q=%E5%9B%AD%E8%89%BA&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170419',
    'https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20170419&stats_click=search_radio_all%3A1&js=1&imgfile=&q=%E8%BF%9B%E5%8F%A3%E7%8B%97%E7%B2%AE&suggest=history_3&_input_charset=utf-8&wq=&suggest_query=&source=suggest',
    'https://s.taobao.com/search?q=%E5%86%9C%E8%B5%84&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170221',
    'https://fang.taobao.com/',
    'https://s.taobao.com/list?spm=a21bo.50862.201867-links-10.27.iQWRJS&source=youjia&cat=50097129',
    'https://www.jiyoujia.com/markets/youjia/zhuangxiucailiao',
    'https://s.taobao.com/list?spm=a21bo.7932212.202572.1.rtUtMQ&source=youjia&q=%E5%AE%B6%E5%85%B7',
    'https://s.taobao.com/list?source=youjia&cat=50065206%2C50065205',
    'https://s.taobao.com/list?spm=a21bo.50862.201867-links-11.80.K6jN68&source=youjia&cat=50008163&bcoffset=0&s=240',
    'https://car.tmall.com/wow/car/act/carfp',
    'https://2car.taobao.com/',
    'https://car.tmall.com/wow/car/act/carfp',
    'https://www.taobao.com/markets/bangong/pchome',
    'https://www.taobao.com/markets/dingzhi/home',
    'https://wujin.taobao.com/',
    'https://s.taobao.com/list?source=youjia&q=%E7%99%BE%E8%B4%A7',
    'https://s.taobao.com/list?source=youjia&cat=50035867&bcoffset=0&s=240',
    'https://www.taobao.com/market/jiadian/baojian.php?spm=a21bo.50862.201867-main.46.K6jN68',
    'https://xue.taobao.com',
    'https://ka.taobao.com/',
    'https://s.taobao.com/list?q=%E4%B8%8A%E9%97%A8%E6%9C%8D%E5%8A%A1&cat=50097750'
]


class TceIdSpider(BaseSpider):
    name = "TceId"
    start_urls = CATEGORY_URLS

    def parse(self, response):
        # Each page module stores its config as JSON in a tms-data attribute
        data = response.xpath('//div[@tms-data]/@tms-data').extract()
        data = [json.loads(cur_data) for cur_data in data]
        tce_ids = []
        for cur_data in data:
            for key in cur_data:
                if not key.startswith('items'):
                    continue
                for item in cur_data[key]:
                    # Only modules loaded via jsonp carry a tce_sid/tce_vid pair
                    if not ('tms_type' in item and item['tms_type'] == 'jsonp'):
                        continue
                    tce_ids.append([
                        str(item['data_para']['tce_sid']),
                        item['data_para']['tce_vid']
                    ])
        if not tce_ids:
            self.logger.warning('No tce_id on "{}"'.format(response.url))
        else:
            for tce_id in tce_ids:
                json.dump(tce_id, self.file)
                self.file.write('\n')
```

`item_id.py` crawls the product IDs.

```python
# -*- coding: utf-8 -*-
from .base import BaseSpider
import codecs
import json


def gen_start_urls():
    tce_sids = []
    tce_vids = []
    with codecs.open('TceId.txt', 'r', 'utf-8') as file:
        for line in file:
            data = json.loads(line)
            tce_sids.append(data[0])
            tce_vids.append(data[1])
    size = len(tce_sids)
    for start in range(0, size, 20):  # query up to 20 IDs per request
        end = min(start + 20, size)
        url = ('https://tce.taobao.com/api/mget.htm?callback=jsonp123'
               '&tce_sid={0}&tce_vid={1}'
               '&tid={2}&tab={2}&topic={2}&count={2}').format(
            ','.join(tce_sids[start:end]),
            ','.join(tce_vids[start:end]),
            ',' * (end - start)
        )
        yield url


class ItemIdSpider(BaseSpider):
    name = "ItemId"
    start_urls = gen_start_urls()

    def parse(self, response):
        # Strip the jsonp wrapper: keep everything between the outermost braces
        data = response.text[response.text.find('{') : response.text.rfind('}') + 1]
        data = json.loads(data)
        # if 'result' not in data:
        #     return
        for tce in data['result'].values():
            for item in tce['result']:
                if 'auction_id' not in item or item['auction_id'] == '0':
                    continue
                self.file.write(item['auction_id'])
                self.file.write('\n')
```
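The `mget.htm` endpoint answers with jsonp (`jsonp123({...});`), which is why `parse` slices between the first `{` and the last `}`. The same idea as a standalone helper, a minimal sketch you could reuse for other jsonp endpoints (the `strip_jsonp` name is mine, not part of the original code):

```python
import json


def strip_jsonp(text):
    """Extract the JSON payload from a jsonp response such as
    ``jsonp123({...});`` by slicing between the outermost braces;
    this mirrors what ItemIdSpider.parse does inline."""
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end == -1:
        raise ValueError('no JSON object in jsonp response')
    return json.loads(text[start:end + 1])


# e.g. strip_jsonp('jsonp123({"result": {}});') == {'result': {}}
```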
## Crawling the Comment List with Selenium+PhantomJS ##

I originally planned to use Scrapy for this part as well, but the comment URL carries a `ua` (User Action) parameter that I couldn't figure out how to compute, so I simply drove a browser with Selenium instead, while still imitating the structure of a Scrapy spider.

Why not scrapy-splash? Because I could not get Splash installed here no matter what I tried, so I gave up on it…

One trick: I override the jsonp callback to capture the returned data directly. This yields the most complete data, and I never have to care about the DOM structure.

In my tests I was not blocked even without proxy IPs or `sleep` calls; Taobao probably just checks that the number of concurrent requests isn't too high. I don't know whether this holds for everyone.

```python
# -*- coding: utf-8 -*-
import codecs
from selenium import webdriver
from selenium.common.exceptions import *
import re
import math
import json


def gen_start_urls():
    with codecs.open('ItemId.txt', 'r', 'utf-8') as file:
        for line in file:
            yield 'https://item.taobao.com/item.htm?id=' + line.strip()


class CommentSpider():
    name = "Comment"
    start_urls = gen_start_urls()

    def start_requests(self):
        # Create the driver outside the try block so that a failed start
        # doesn't leave `driver` unbound in the finally clause
        driver = webdriver.PhantomJS(
            executable_path=r'F:\WebDriver\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        try:
            driver.implicitly_wait(10)
            for url in self.start_urls:
                print('Crawling "{}"'.format(url))
                driver.get(url)
                self.parse_item(driver)
            return []
        finally:
            driver.close()

    def parse_item(self, driver):
        try:
            if '//item.taobao.com/item.htm' not in driver.current_url:
                print('Unknown page')
                return
            comment_count = int(
                driver.find_element_by_class_name('J_ReviewsCount').text)
            if comment_count == 0:
                print('No comment')
                return
            # Taobao shows 20 comments per page
            print('Estimated max pages', math.ceil(comment_count / 20))
        except Exception as e:
            print(e)
        try:
            # Hook the jsonp callback: stash every response in a JS global
            driver.execute_script('jsonp_tbcrate_reviews_list = '
                                  'function(data){ comment_data = data }')
            # Click the tab that displays the comments
            driver.find_element_by_css_selector(
                'a.tb-tab-anchor[data-index="1"]').click()
            revbd_elem = driver.find_element_by_class_name('tb-revbd')
        except Exception as e:
            print(e)
            return
        with codecs.open(self.get_filename(driver), 'w', 'utf-8') as file:
            page = 0
            while True:
                page += 1
                print('Page', page)
                if not self.parse_comments(driver, revbd_elem, file):
                    break
                if not self.go_to_next_page(revbd_elem):
                    break

    def get_filename(self, driver):
        # Map characters that are illegal in Windows filenames to their
        # fullwidth equivalents; the Comments/ directory must already exist
        REPLACE = {
            '\\': '\',
            '/': '/',
            ':': ':',
            '*': '*',
            '?': '?',
            '"': "'",
            '<': '<',
            '>': '>',
            '|': '|'
        }
        title = driver.title
        for k, v in REPLACE.items():
            title = title.replace(k, v)
        return 'Comments/{} {}.txt'.format(
            re.findall(r'[\?&]id=(\d+)', driver.current_url)[0],
            title
        )

    def parse_comments(self, driver, revbd_elem, file):
        try:
            # The implicit wait makes this lookup block until the request ends
            comment_elems = revbd_elem.find_elements_by_class_name(
                'J_KgRate_ReviewItem')
            if not comment_elems:
                print('Anti spider!')
                return False
            # Fetch the jsonp response captured by the hooked callback
            comment_data = driver.execute_script('return comment_data')
            json.dump(comment_data, file)
            file.write('\n')
        except Exception as e:
            print(e)
        return True

    def go_to_next_page(self, revbd_elem):
        try:
            next_elem = revbd_elem.find_element_by_class_name('pg-next')
            if 'pg-disabled' in next_elem.get_attribute('class'):
                return False
            next_elem.click()
        except NoSuchElementException:
            # Only 1 page of comments
            return False
        except Exception as e:
            print(e)
            return False
        return True


if __name__ == '__main__':
    spider = CommentSpider()
    spider.start_requests()
```

## Results ##

A comment crawled at random looks like this:

![Result][SouthEast]

[SouthEast]: /images/20220607/140f9d8565b44b45a6c64ff4c556cde6.png
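Each file under `Comments/` ends up holding one captured jsonp payload per comment page, one JSON object per line. As a minimal sketch, the dumps can be read back like this (the `load_comment_pages` helper is only illustrative; what each payload contains depends on Taobao's response, so inspect the keys before relying on them):

```python
# -*- coding: utf-8 -*-
import codecs
import glob
import json


def load_comment_pages():
    """Yield (filename, payload) for every page dumped by CommentSpider.

    Assumes the layout used above: Comments/<item id> <title>.txt with
    one jsonp payload (a JSON object) per line.
    """
    for path in glob.glob('Comments/*.txt'):
        with codecs.open(path, 'r', 'utf-8') as file:
            for line in file:
                line = line.strip()
                if line:
                    yield path, json.loads(line)


if __name__ == '__main__':
    for path, page_data in load_comment_pages():
        # page_data is whatever jsonp_tbcrate_reviews_list received
        print(path, type(page_data))
```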