# Scraping Amazon product reviews with the Scrapy framework.
#
# Tutorial structure (reconstructed from the original article):
#   1. Create the project:   scrapy startproject MySpiderTest
#      NOTE(review): the project is created as "MySpiderTest" but the spider
#      imports "mySpiderTest.items" — the package name must match; confirm
#      the actual directory name on disk.
#   2. Define the items (items.py)
#   3. Create the spider (spiders/amazon_review.py)
#   4. Define the pipeline (pipelines.py)
#   5. Register the pipeline in settings.py:
#        ITEM_PIPELINES = {'mySpiderTest.pipelines.MyspidertestPipeline': 300}
#   6. Run — "category" is the search keyword; several are comma separated:
#        scrapy crawl amazon_review -a category=phone
#        scrapy crawl amazon_review -a category=huawei,oppo,vivo

import codecs
import json
import re
from copy import deepcopy
from urllib import parse as url_parse

import scrapy
from scrapy.item import Field, Item


# --------------------------------------------------------------------------
# items.py
# --------------------------------------------------------------------------
class ItcastItem(Item):
    # Leftover from the Scrapy template tutorial; kept so existing
    # references (if any) do not break.
    name = Field()
    title = Field()
    info = Field()


class AmazonReviewItem(Item):
    """One customer review of one Amazon product."""
    user_id = Field()             # reviewer id, parsed from the review element id
    user_name = Field()           # reviewer display name
    data_asin = Field()           # ASIN of the reviewed product
    name = Field()                # product name slug taken from the product URL
    review_title = Field()        # review headline
    review_star_rating = Field()  # rating text, e.g. "5.0 out of 5 stars"
    review_date = Field()         # review date text
    review_info = Field()         # review body text


class AmazonGoodsItem(scrapy.Item):
    """One product found on a search-result page."""
    # collection = 'amazon'      # (original commented-out Mongo collection name)
    s_href = scrapy.Field()      # sub-category url
    data_asin = scrapy.Field()   # product ASIN
    name = scrapy.Field()        # product name
    goods_url = scrapy.Field()   # product url
    brand = scrapy.Field()       # product brand
    price = scrapy.Field()       # product price
    freight = scrapy.Field()     # shipping cost


# --------------------------------------------------------------------------
# spiders/amazon_review.py
# --------------------------------------------------------------------------
class AmazonReviewSpider(scrapy.Spider):
    """Crawl Amazon search results (e.g. ?k=phone) for one or more keywords
    and yield an AmazonReviewItem per customer review of each product found."""

    name = 'amazon_review'
    allowed_domains = ['www.amazon.com']
    # start_urls is built in __init__ from the -a category=... argument.

    def __init__(self, category=None, *args, **kwargs):
        """Build start_urls from `category` (comma-separated keywords);
        default to searching for "phone" when none is given."""
        super(AmazonReviewSpider, self).__init__(*args, **kwargs)
        self.start_urls = []
        if category is not None:
            for key in category.split(","):
                self.start_urls.append(
                    'https://www.amazon.com/s?k=' + key + '&ref=nb_sb_noss')
        else:
            # default search keyword: phone
            self.start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']
        self.log("category = %s" % category)

    def parse(self, response):
        """Parse one search-result page: schedule the first review page of
        every product, then follow the next result page."""
        div_list = response.xpath(
            '//*[@id="search"]'
            '//div[@class="s-result-list s-search-results sg-row"]/div')
        self.log("div_list_len=%s" % str(len(div_list)))
        for each_div in div_list:
            # FIX: build a fresh item per product — the original reused one
            # AmazonGoodsItem instance across the whole loop.
            item = AmazonGoodsItem()
            goods_url = each_div.xpath(".//h2/a/@href").extract_first()
            if goods_url is None:
                # ad / separator rows carry no product link
                continue
            item['goods_url'] = url_parse.unquote(goods_url)
            item['name'] = self.get_goods_name(item['goods_url'])
            item['data_asin'] = self.get_data_asin(item['goods_url'])
            # first page of this product's review listing
            review_url = ('https://www.amazon.com/' + item['name']
                          + '/product-reviews/' + item['data_asin']
                          + '/ref=cm_cr_getr_d_paging_btm_next_1?ie=UTF8'
                          + '&reviewerType=all_reviews&pageNumber=1')
            yield scrapy.Request(
                review_url,
                callback=self.parse_review_detail,
                meta={"item": deepcopy(item)}
            )
        # next search-result page
        next_url = response.xpath(
            "//*[@id='search']/div[1]/div[2]/div/span[7]"
            "/div/div/div/ul/li[7]/a/@href").extract_first()
        if next_url is not None:
            # FIX: the original prefixed 'https://www.amazon.cn' — the wrong
            # domain, which allowed_domains would filter out.  urljoin keeps
            # the request on www.amazon.com.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_review_detail(self, response):
        """Yield one AmazonReviewItem per review block on a review page,
        then follow the review listing's "next page" link."""
        goods_item = response.meta["item"]
        for each in response.xpath('//*[starts-with(@id,"customer_review-")]'):
            item = AmazonReviewItem()
            item['data_asin'] = goods_item['data_asin']
            item['name'] = goods_item['name']
            # element id looks like "customer_review-R35WB3S3WWC9DN"
            item["user_id"] = each.xpath("@id").extract_first().split("-")[1]
            # FIX: the original used absolute '//' XPaths below, which always
            # matched the FIRST review on the page for every item; './/' scopes
            # each query to the current review block.  extract_first() replaces
            # extract()[0] so a missing node yields None instead of IndexError.
            item["user_name"] = each.xpath(
                ".//span[@class='a-profile-name']").xpath('string(.)').extract_first()
            item['review_title'] = each.xpath(
                ".//a[@data-hook='review-title']").xpath('string(.)').extract_first()
            item['review_star_rating'] = each.xpath(
                ".//i[@data-hook='review-star-rating']").xpath('string(.)').extract_first()
            item['review_date'] = each.xpath(
                ".//span[@data-hook='review-date']").xpath('string(.)').extract_first()
            item['review_info'] = each.xpath(
                ".//span[@data-hook='review-body']").xpath('string(.)').extract_first()
            yield item
        # is there a next review page?
        next_page = response.xpath(
            '//*[@id="cm_cr-pagination_bar"]/ul/li[2]/a/@href').extract_first()
        self.log("-------next_page = %s" % next_page)
        if next_page is not None:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse_review_detail,
                meta={"item": deepcopy(goods_item)}
            )

    @staticmethod
    def get_goods_name(url):
        """Return the product-name slug from a search-result URL.

        Handles both forms:
          sponsored redirect:
            /gp/slredirect/picassoRedirect.html/ref=...&url=/NAME/dp/ASIN/ref=...
          direct:
            /NAME/dp/ASIN/ref=...
        Returns '' when url is None.
        """
        if url is None:
            return ''
        match = re.search(r"url=/(.*?)/", url)
        if match:
            # sponsored redirect form: the slug follows "url=/"
            return match.group(1)
        # direct form: the slug is the first path segment
        return url.split("/")[1]

    @staticmethod
    def get_data_asin(url):
        """Return the ASIN (path segment after /dp/) from a product URL,
        falling back to the first path segment; '' when url is None."""
        if url is None:
            return ''
        match = re.search(r"dp/(.*?)/", url)
        if match:
            return match.group(1)
        return url.split("/")[1]


# --------------------------------------------------------------------------
# pipelines.py
# --------------------------------------------------------------------------
class MyspidertestPipeline(object):
    """Append every scraped item to amazon_reviews.json as JSON Lines
    (one JSON object per line, UTF-8, non-ASCII kept verbatim)."""

    def __init__(self):
        self.review_file = codecs.open('amazon_reviews.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.review_file.write(line)
        return item

    def close_spider(self, spider):
        # FIX: the original defined spider_closed(), which Scrapy never calls
        # on a pipeline without an explicit signal hookup (the pipeline hook
        # is close_spider), and it called .closed() — the read-only file
        # attribute — instead of .close(), so the file was never closed.
        self.review_file.close()
还没有评论,来说两句吧...