Python爬取图片

灰太狼 2021-04-08 04:26 873阅读 0赞
  1. import requests # 模块导入的俩种方法
  2. from multiprocessing import Pool
  3. import re
  def get(url):
      """Download one listing page and return its HTML as text.

      Args:
          url: Address of the page to fetch.

      Returns:
          The response body decoded with the ``gbk`` codec, or ``None`` when
          the server answers with a status code other than 200.

      Raises:
          requests.RequestException: If the request fails or times out.
      """
      # Without a timeout a stalled server would block the caller forever.
      ret = requests.get(url, timeout=10)
      if ret.status_code != 200:
          return None
      # Substitute undecodable bytes instead of raising, so that a single
      # bad byte does not discard the whole page.
      return ret.content.decode('gbk', errors='replace')
  def call_back(arg):
      """Parse one listing page, then download and save every picture on it.

      Passed as the ``callback`` of ``Pool.apply_async``, so ``arg`` is
      whatever ``get`` returned for the page. Matching is done with the
      module-level compiled pattern ``com``.

      Args:
          arg: HTML text of a listing page, or ``None`` / an empty string
              when the page could not be fetched.

      Returns:
          A list with one dict per pattern match, each holding the keys
          ``'png'``, ``'name'`` and ``'place'``. The list is empty when
          there is no page text to parse.
      """
      # get() returns None for non-200 pages; finditer(None) would raise
      # TypeError, so bail out before touching the pattern.
      if not arg:
          return []

      dict_lst = [
          {
              'png': match.group('png'),
              'name': match.group('name'),
              'place': match.group('place'),
          }
          for match in com.finditer(arg)
      ]

      for item in dict_lst:
          res = subget(item['png'])
          if res is None:
              # subget() returns None when the picture could not be
              # downloaded; there are no bytes to write in that case.
              continue
          write_func(item['name'], item['place'], res)
      return dict_lst
  def subget(url):
      """Download one picture and return its raw bytes.

      Args:
          url: Picture address as found in the page. It may be absolute
              (``http://`` or ``https://``) or relative to
              ``http://www.xiaohuar.com``.

      Returns:
          The response body as ``bytes``, or ``None`` when the request
          fails, times out, or the status code is not 200.
      """
      from urllib.parse import urljoin

      # urljoin keeps absolute URLs untouched and resolves relative ones
      # against the site root. The previous substring test
      # ``'https' in url`` wrongly prefixed plain ``http://`` links and
      # treated relative paths containing "https" as absolute.
      full_url = urljoin('http://www.xiaohuar.com', url)
      try:
          # Without a timeout a stalled image server blocks the caller forever.
          ret = requests.get(full_url, timeout=10)
      except requests.RequestException:
          # Report network failures the same way as a non-200 answer.
          return None
      if ret.status_code == 200:
          return ret.content
      return None
  def write_func(path, place, picture, directory=r'E:\text1\爬虫\text_png'):
      """Save one downloaded picture as ``<path>-<place>.png``.

      Args:
          path: First part of the file name.
          place: Second part of the file name.
          picture: Raw image bytes. When ``None`` nothing is written.
          directory: Folder that receives the file. Defaults to the folder
              that used to be hard-coded here; it is created when missing.

      Returns:
          None.
      """
      from pathlib import Path

      if picture is None:
          # Nothing was downloaded; avoid leaving an empty file behind and
          # failing on f.write(None).
          return

      # Scraped text may contain characters that are not allowed in
      # Windows file names (\ / : * ? " < > |) or control characters such
      # as newlines; replace them so open() does not fail.
      file_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', '%s-%s.png' % (path, place))

      target_dir = Path(directory)
      target_dir.mkdir(parents=True, exist_ok=True)
      with open(target_dir / file_name, 'wb') as f:
          f.write(picture)
  39. '''我要爬取的网页的特征'''
  40. '''http://www.xiaohuar.com/list-1-0.html'''
  41. '''http://www.xiaohuar.com/list-1-43.html'''
  if __name__ == '__main__':
      # call_back() reads this compiled pattern through the module-level
      # name ``com``. The named groups capture the picture URL ('png') and
      # the two text fields used for the file name ('name', 'place').
      com = re.compile(
          '<div class="item_t">(?:.*?)src="(?P<png>.*?)"(?:.*?)<span class="price">(?P<name>.*?)</span>(?:.*?)'
          '<a href="http://www.xiaohuar.com/" class="img_album_btn">(?P<place>.*?)</a>', re.S)

      pool = Pool(3)
      # Queue listing pages list-1-0.html .. list-1-39.html; each fetched
      # page is handed to call_back.
      for i in range(40):
          pool.apply_async(
              get,
              args=('http://www.xiaohuar.com/list-1-%s.html' % i,),
              callback=call_back,
              # Without an error_callback, an exception raised inside get()
              # is dropped silently and the page just goes missing.
              error_callback=print,
          )
      pool.close()
      pool.join()

缺点:爬取速度慢,而且最多只能爬取17个网页(好无奈)。

发表评论

表情:
评论列表 (有 0 条评论,873人围观)

还没有评论,来说两句吧...

相关阅读