Python学习（四）：多线程下载今日头条的街拍图片

喜欢ヅ旅行 2022-03-27 03:18 236阅读 0赞

本程序使用了MongoDB数据库保存

MongoDB数据库可以保存字典

使用了进程池Pool

同时下载100页网站的内容

# 使用多进程第街拍图片进行下载，并将图片相关信息保存到MongoDB数据库中
    from _md5 import md5
    
    import requests, re, json, pymongo
    from multiprocessing import Pool
    from urllib.parse import urlencode
    
    
    class JiePaiSpider(object):
        # 进程池pool无法序列化pymongo对象，因为pymongo数据库中含有线程锁
        # TypeError：can'tpickle_thread.lock objects
        # 建立pymongo的链接
        client = pymongo.MongoClient('localhost')
        db = client['picture']
    
        def __init__(self):
            self.headers ={
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36'
            }
    
        def get_list_json(self,offset):
            """
            请求列表页的json接口，获取列表页中的图片信息
            :param offset: 请求接口的偏移量参数
            :return:
            """
            # https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis
            # 准备接受参数
            params = {
                'offset': offset,
                'format': 'json',
                'keyword': '街拍',
                'autoload': 'true',
                'count': '20',
                'cur_tab': '1',
                'from': 'search_tab',
                'pd': 'synthesis',
            }
            api_url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
            try:
                response = requests.get(api_url, headers=self.headers)
                # 响应状态是200，说明GET请求成功
                if response.status_code == 200:
                    return response.text
                else:
                    print("请求异常：url:{}, status_code={}".format(api_url, response.status_code))
                    return None
            except Exception as e:
                print("请求异常：url:{}, error={}".format(api_url, e))
                return None
    
        def perse_list_json(self, json_str):
            """
            解析列表页json数据
            :param json_str:
            :return:
            """
            json_dict = json.loads(json_str)
            if 'data' in json_dict.keys():
                # 判断字典json_dict的键中是否包含‘data’，如果有，可以解析，如果没有，可能没哟数据或发生异常
                data_list = json_dict.get('data',None)
                if data_list and len(data_list) > 0:
                    # 说明有数据，可以解析
                    urls = []
                    for item in data_list:
                        if 'single_mode' not in item and 'cell_type' not in item:
                            article_url = item['article_url']
                            urls.append(article_url)
                    return urls
    
        def get_detail_page(self, detail_urls):
            try:
                response = requests.get(detail_urls, headers=self.headers)
                # 响应状态是200，说明GET请求成功
                if response.status_code == 200:
                    return response.text
                else:
                    print("请求异常：url:{}, status_code={}".format(detail_urls, response.status_code))
                    return None
            except Exception as e:
                print("请求异常：url:{}, error={}".format(detail_urls, e))
                return None
    
        def parse_detail_page(self, detail_html):
            # \(:表示对正则表达式中的（进行转义，转化为一个普通的字符
            js_json_str = re.findall(re.compile(r'gallery: JSON\.parse\((.*?)\),', re.S), detail_html)[0].replace('\\', '').strip('"')
            # 数据保存到MongoDB中
            data_dict = json.loads(js_json_str)
            self.save_dict_to_db(data_dict)
    
            # 解析Json，取出图片地址，下载到本地
            for item_dict in data_dict['sub_images']:
                img_url = item_dict['url']
                # 根据图片url地址，下载图片
                self.download_image(img_url)
    
        def download_image(self, img_url):
            response = requests.get(img_url, headers=self.headers)
            if response.status_code == 200:
                # response.text:获取的是文件资源，（json字符串，网页源代码）
                # 但是图片属于二进制资源，图片数据的传输是以二进制流的形式传输的，不在是字符串
                content = response.content
                # md5()函数的参数需要的是一个bytes字节码，不能是str类型的字符串
                # hexdigest():获取随机字符串
                img_name = md5(img_url.encode('utf-8')).hexdigest()
                # 'w':写入普通文本；'wb'：专门写入二进制数据（图片，音频，视频）
                f = open('imgs/{}.jpg'.format(img_name),'wb')
                f.write(content)
                f.close()
            else:
                print('图片请求失败：{}'.format(img_url))
    
        def save_dict_to_db(self, dic):
            self.db['img'].insert_one(dic)
    
        def start_spider(self,offset):
            print('正在请求偏移量为{}的图片'.format(offset))
            json_str = self.get_list_json(offset)
            if json_str:
                urls = self.perse_list_json(json_str)
                for detail_url in urls:
                    detail_html = self.get_detail_page(detail_url)
                    if detail_html:
                        self.parse_detail_page(detail_html)
    
    
    if __name__ == '__main__':
        jp = JiePaiSpider()
        pool = Pool(3)
        pool.map(jp.start_spider, [x for x in range(0, 101) if x % 20 == 0])
        pool.close()
        pool.join()