"""Python Study (4): download Toutiao street-snap ("街拍") images in parallel.

Originally posted by 喜欢ヅ旅行 on 2022-03-27 03:18.

Although the original title says "multi-threaded", the work is distributed
over a ``multiprocessing.Pool`` of three worker processes.  Each worker
handles one search offset (0, 20, ..., 100; 20 results per request), stores
the gallery metadata of every article in MongoDB (which can store dicts
directly) and downloads the referenced images into the local ``imgs``
directory.
"""
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests


class JiePaiSpider(object):
    """Crawl Toutiao search results for street-snap galleries."""

    # A Pool has to pickle the bound method it dispatches, and pymongo
    # objects cannot be pickled because they hold thread locks
    # (TypeError: can't pickle _thread.lock objects).  Keeping the
    # connection on the class instead of on the instance keeps it out of
    # the pickled state.
    # NOTE(review): connect=False defers connecting until the first
    # operation, so no connection is opened before the Pool forks its
    # workers -- confirm against the PyMongo fork-safety notes for the
    # installed version.
    client = pymongo.MongoClient('localhost', connect=False)
    db = client['picture']

    # Seconds to wait for any single HTTP request; without a timeout a
    # stalled connection would block a worker forever.
    request_timeout = 10

    # Directory the downloaded images are written to.
    image_dir = 'imgs'

    # "\(" escapes the parenthesis so that it matches a literal "(".
    _gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\),', re.S)

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36'
        }

    def _fetch_text(self, url):
        """
        GET ``url`` and return the response body as text.

        :param url: address to request
        :return: response text, or None if the request failed or the
                 status code was not 200
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.request_timeout)
        except requests.RequestException as e:
            print("请求异常:url:{}, error={}".format(url, e))
            return None
        if response.status_code != 200:
            print("请求异常:url:{}, status_code={}".format(url, response.status_code))
            return None
        return response.text

    def get_list_json(self, offset):
        """
        Request the JSON search API and return the raw list-page payload.

        :param offset: offset parameter of the search API
        :return: JSON text, or None on failure
        """
        # Example:
        # https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis
        params = {
            'offset': offset,
            'format': 'json',
            'keyword': '街拍',
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
            'from': 'search_tab',
            'pd': 'synthesis',
        }
        api_url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
        return self._fetch_text(api_url)

    def perse_list_json(self, json_str):
        """
        Extract the article URLs from a list-page JSON payload.

        :param json_str: JSON text returned by ``get_list_json``
        :return: list of article URLs; empty when the payload is invalid
                 or contains no usable entries (never None, so callers
                 can iterate the result unconditionally)
        """
        try:
            json_dict = json.loads(json_str)
        except ValueError as e:
            print('列表页JSON解析失败:error={}'.format(e))
            return []

        # A missing or empty 'data' entry means there is nothing to parse.
        data_list = json_dict.get('data') if isinstance(json_dict, dict) else None
        if not data_list:
            return []

        urls = []
        for item in data_list:
            # Entries carrying 'single_mode' or 'cell_type' are skipped,
            # as are entries without an article URL.
            if 'single_mode' in item or 'cell_type' in item:
                continue
            article_url = item.get('article_url')
            if article_url:
                urls.append(article_url)
        return urls

    # Correctly spelled alias; the original name is kept for existing callers.
    parse_list_json = perse_list_json

    def get_detail_page(self, detail_urls):
        """
        Download the HTML of one article page.

        :param detail_urls: URL of a single article page
        :return: HTML text, or None on failure
        """
        return self._fetch_text(detail_urls)

    @staticmethod
    def _decode_gallery(raw):
        """
        Decode the argument text of ``JSON.parse(...)`` into Python data.

        The argument is normally a quoted JS string literal whose content
        is JSON, so it is decoded twice.  If that fails, fall back to the
        original heuristic of dropping every backslash and the outer quotes.

        :param raw: text captured between the parentheses
        :return: decoded data
        :raises ValueError: if neither strategy yields valid JSON
        """
        try:
            decoded = json.loads(raw)
            if isinstance(decoded, str):
                decoded = json.loads(decoded)
        except ValueError:
            decoded = json.loads(raw.replace('\\', '').strip('"'))
        return decoded

    def parse_detail_page(self, detail_html):
        """
        Store the gallery data of an article page and download its images.

        Pages without a parsable gallery are reported and skipped.

        :param detail_html: HTML text of the article page
        :return: None
        """
        match = self._gallery_pattern.search(detail_html)
        if match is None:
            print('详情页中未找到图集数据')
            return

        try:
            data_dict = self._decode_gallery(match.group(1))
        except ValueError as e:
            print('图集JSON解析失败:error={}'.format(e))
            return
        if not isinstance(data_dict, dict):
            print('图集JSON解析失败:error={}'.format('unexpected payload type'))
            return

        # Persist the gallery metadata in MongoDB.
        self.save_dict_to_db(data_dict)

        # Download every image referenced by the gallery.
        for item_dict in data_dict.get('sub_images') or []:
            img_url = item_dict.get('url')
            if img_url:
                self.download_image(img_url)

    def download_image(self, img_url):
        """
        Download one image and write it to ``image_dir``.

        :param img_url: URL of the image
        :return: None
        """
        try:
            response = requests.get(
                img_url, headers=self.headers, timeout=self.request_timeout)
        except requests.RequestException as e:
            print('图片请求失败:{}, error={}'.format(img_url, e))
            return
        if response.status_code != 200:
            print('图片请求失败:{}'.format(img_url))
            return

        # md5() needs bytes rather than str.  The hex digest is
        # deterministic, so the same URL always maps to the same file name.
        img_name = md5(img_url.encode('utf-8')).hexdigest()

        os.makedirs(self.image_dir, exist_ok=True)
        img_path = os.path.join(self.image_dir, '{}.jpg'.format(img_name))
        # Images are binary data: use response.content (bytes) and 'wb'
        # rather than response.text and 'w'.
        with open(img_path, 'wb') as f:
            f.write(response.content)

    def save_dict_to_db(self, dic):
        """
        Insert ``dic`` into the ``img`` collection.

        :param dic: document to store
        :return: None
        """
        self.db['img'].insert_one(dic)

    def start_spider(self, offset):
        """
        Crawl one search-result page and every article it links to.

        :param offset: offset parameter of the search API
        :return: None
        """
        print('正在请求偏移量为{}的图片'.format(offset))
        json_str = self.get_list_json(offset)
        if not json_str:
            return
        for detail_url in self.perse_list_json(json_str):
            detail_html = self.get_detail_page(detail_url)
            if detail_html:
                self.parse_detail_page(detail_html)


def main():
    """Crawl the offsets 0, 20, ..., 100 with three worker processes."""
    jp = JiePaiSpider()
    with Pool(3) as pool:
        # map() blocks until every offset has been processed.
        pool.map(jp.start_spider, list(range(0, 101, 20)))


if __name__ == '__main__':
    main()
还没有评论,来说两句吧...