【python爬虫】爬取知乎收藏夹内所有图片

拼搏现实的明天。 2022-09-24 13:19 275阅读 0赞
  1. 例如,要爬取的收藏夹地址为:https://www.zhihu.com/collection/26347524

只使用 Python 2 自带的标准库(urllib、urllib2、re 等),没有使用任何爬虫框架。

Center

  1. # -*- coding: utf-8 -*-
  2. from __future__ import unicode_literals
  3. import urllib
  4. import urllib2
  5. import re
  6. import socket
  7. import os
  class Spider:
      """Download every image found on the pages of one Zhihu collection.

      Pages are requested as ``<site_url>?page=N`` starting at N=1, and
      crawling stops at the first page on which no image URL is found.
      """

      # Proxy mapping handed to urllib2.ProxyHandler for page requests.
      # NOTE(review): only the 'http' scheme is mapped, so https page
      # requests are presumably not proxied -- confirm this is intended.
      PROXY = {'http': '115.215.209.77:8118'}

      # Browser-like User-Agent sent with every page request.
      HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0'}

      # Compiled once: captures the https URL in a data-original attribute,
      # up to (not including) the first '&' that follows it.
      IMG_RE = re.compile(r'data-original="(https://.*?)&')

      def __init__(self, site_url, save_dir=None):
          """Remember the collection URL and where images are written.

          Args:
              site_url: URL of the collection, without the ``?page=`` part.
              save_dir: Directory for downloaded images.  When omitted, the
                  first run of digits in ``site_url`` is used, falling back
                  to the current directory if the URL has no digits.
          """
          self.site_url = site_url
          self.p = 0  # number of the page fetched most recently
          if save_dir is None:
              digits = re.findall(r"\d+", site_url)
              save_dir = digits[0] if digits else '.'
          self.save_dir = save_dir

      @staticmethod
      def _extract_image_urls(page):
          """Return the list of image URLs matched by IMG_RE in ``page``."""
          return Spider.IMG_RE.findall(page)

      def get_page(self):
          """Advance to the next page and return its raw response body.

          Raises:
              urllib2.URLError: propagated from the opener if the request
                  fails.
          """
          # Use a private opener rather than install_opener(), so the proxy
          # setting does not leak into every other urllib2 call.
          opener = urllib2.build_opener(urllib2.ProxyHandler(self.PROXY))
          self.p += 1
          url = self.site_url + "?page=" + str(self.p)
          print(url)
          request = urllib2.Request(url, headers=self.HEADERS)
          response = opener.open(request)
          try:
              return response.read()
          finally:
              response.close()

      def get_pic(self):
          """Download all images page by page into ``self.save_dir``.

          Files are named ``picture_<page>_<index>.jpg``.  A failed download
          is reported and skipped; its index is reused by the next image,
          so successful files on a page are numbered without gaps.
          """
          if not os.path.isdir(self.save_dir):
              os.makedirs(self.save_dir)
          while True:
              img_list = self._extract_image_urls(self.get_page())
              print('img_list')
              print(img_list)
              if not img_list:
                  # A page without images is treated as the end.
                  break
              x = 1
              for img_url in img_list:
                  print('正在保存第%s页的第%s张' % (self.p, x))
                  target = os.path.join(
                      self.save_dir, 'picture_%s_%s.jpg' % (self.p, x))
                  try:
                      urllib.urlretrieve(img_url, target)
                  except IOError as e:
                      # urlretrieve reports failures as IOError; URLError
                      # and socket.timeout are subclasses of it.
                      timed_out = (
                          isinstance(e, socket.timeout)
                          or isinstance(getattr(e, 'reason', None),
                                        socket.timeout))
                      if timed_out:
                          print("下载超时,跳过此图: %r" % e)
                      else:
                          print("下载失败,跳过此图: %r" % e)
                      continue
                  x += 1
  # Script entry: ask for a collection, prepare the output directory and
  # crawl every page of that collection.
  print('请输入收藏夹代号:')
  in_URL = raw_input().strip()
  li = re.findall(r"\d+", in_URL)
  if not li:
      # Without a numeric collection id there is nothing to crawl.
      raise SystemExit('输入中未找到收藏夹代号')
  name = li[0]  # first run of digits in the input, used as the folder name
  if not re.match(r'https?://', in_URL):
      # The prompt asks for the bare collection code; expand it to the
      # full collection URL so the request is valid.
      in_URL = 'https://www.zhihu.com/collection/' + name
  print('图片保存在当前目录的:%s下' % name)
  if not os.path.exists('%s' % name):
      os.makedirs('%s' % name)
  spider = Spider(in_URL)
  spider.get_pic()
  print('所有收藏夹内图片保存完毕')

转载请注明原地址

发表评论

表情:
评论列表 (有 0 条评论,275人围观)

还没有评论,来说两句吧...

相关阅读