python网络爬虫_爬图片

╰半夏微凉° 2022-11-29 12:54 249阅读 0赞

##                                             python网络爬虫\_爬图片  ##

1.安装 Beautifulsoup4

     # 解析返回的 HTML 与 JSON 数据
     pip install beautifulsoup4

使用 :  
          运行后输入要搜索的关键字  
          输入要下载的数量  
          输入需要保存的文件夹名称

![20200821104219691.png][]

解析返回的 HTML 与 JSON 数据；以下代码可在 Jupyter Notebook 中运行：

    # !pip install Beautifulsoup4 
    ##使用
    # 运行后输入要搜索的关键字
    # 输入要下载的数量
    # 输入需要保存的文件夹名称
    import re
    import requests
    from urllib import error
    from bs4 import BeautifulSoup
    import os
    import cv2 
    
    # Module-level state shared by Find, dowmloadPicture and the script entry point.
    num, numPicture = 0, 0  # running counter used by dowmloadPicture / amount requested by the user
    file = str()            # output folder name, filled in from user input
    List = list()           # Find appends one sub-list of image URLs per scanned page
     
     
    def Find(url):
        """Count the image URLs reported for a search, page by page.

        Requests ``url + str(offset)`` for offsets 0, 60, 120, ... (below 1000)
        and extracts every ``"objURL"`` value from each response body. The
        matches of each page are appended to the module-level ``List`` as one
        sub-list.

        Args:
            url: Search URL prefix; the page offset is appended to it.

        Returns:
            Total number of ``objURL`` matches over the pages that were read.
            Scanning stops at the first page that contains no match.
        """
        global List
        print('正在检测图片总数，请稍等.....')
        offset = 0
        total = 0
        while offset < 1000:
            try:
                response = requests.get(url + str(offset), timeout=7)
            except requests.RequestException:
                # Skip a page that fails to load. Only request errors are caught
                # (not BaseException) so Ctrl-C and SystemExit still stop the scan.
                offset += 60
                continue
            # Pull the image URLs out of the page with a regular expression.
            pic_url = re.findall('"objURL":"(.*?)",', response.text, re.S)
            if not pic_url:
                break
            total += len(pic_url)
            List.append(pic_url)
            offset += 60
        return total
     
    
     
     
    def dowmloadPicture(html, keyword):
        """Download the images referenced in one search-result page.

        Extracts every ``"objURL"`` value from ``html``, opens each one with
        ``cv2.VideoCapture``, resizes the frame it reads to 150x150 and writes
        it to ``./<file>/<file><n>.jpg``, where ``file`` is the module-level
        folder name. The module-level counter ``num`` is advanced only when an
        image was actually written, and the function returns once ``num``
        reaches the module-level ``numPicture``.

        Args:
            html: Text of a result page.
            keyword: Search keyword; used only in the progress message.
        """
        global num
        # Pull the image URLs out of the page with a regular expression.
        pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
        print('找到关键词:' + keyword + '的图片，即将开始下载图片...')
        for each in pic_url:
            # Check the quota before fetching so no extra image is downloaded.
            if num >= numPicture:
                return
            print('正在下载第' + str(num + 1) + '张图片，图片地址:' + str(each))

            saved = False
            cap = cv2.VideoCapture(each)
            try:
                if cap.isOpened():
                    ret, img = cap.read()
                    if ret:
                        img = cv2.resize(img, (150, 150), interpolation=cv2.INTER_AREA)
                        saved = cv2.imwrite('./' + file + '/' + file + str(num + 1) + ".jpg", img)
            finally:
                # Always free the capture, even if resize/imwrite raises.
                cap.release()

            # Count only images that were really saved, so failed URLs do not
            # use up the requested quota or leave gaps in the file numbering.
            if saved:
                num += 1
     
     
    if __name__ == '__main__':  # script entry point
        # Interactive flow: ask for a keyword, report how many images were
        # found, ask how many to download and into which new folder, then
        # fetch the result pages 60 entries at a time.
        from urllib.parse import quote

        word = input("请输入搜索关键词: ")
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot corrupt the query string.
        # Example of a full page URL:
        # http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%BC%A0%E5%A4%A9%E7%88%B1&pn=120
        url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + quote(word) + '&pn='
        tot = Find(url)
        print('经过检测%s类图片共有%d张' % (word, tot))
        numPicture = int(input('请输入想要下载的图片数量 '))
        file = input('请建立一个存储图片的文件夹，输入文件夹名称即可')
        # Keep asking until the name is free; a single retry would still crash
        # in os.mkdir when the second name also exists.
        while os.path.exists(file):
            print('该文件已存在，请重新输入')
            file = input('请建立一个存储图片的文件夹，输入文件夹名称即可')
        os.mkdir(file)

        t = 0
        while t < numPicture:
            page_url = url + str(t)
            try:
                result = requests.get(page_url, timeout=10)
            except requests.RequestException:
                # requests raises its own exception hierarchy, not
                # urllib.error.HTTPError, so catch that to survive network errors.
                print('网络错误，请调整网络后重试')
            else:
                print(page_url)
                dowmloadPicture(result.text, word)
            t += 60

        print('当前搜索结束，感谢使用')

[20200821104219691.png]: /images/20221124/b3be30a0637042e9a85ef28da4b04e5b.png