Crawling Images with Python

心已赠人 2022-08-13 13:52

This is based on someone else's code, to which I added multi-threading combined with a Queue.

[python]

#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Created on 2013-3-21
@author: naughty
'''
# author: wklken
from sgmllib import SGMLParser
import os
import urllib
import urllib2
import urlparse
from Queue import Queue
from threading import Thread

save_path = '/home/zoer'
passUrls = set()
qimg = Queue()


class URLLister(SGMLParser):
    '''Collect the <a href> and <img src> URLs found in a page.'''
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.imgs = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == "href" and v.startswith("http")]
        if href:
            self.urls.extend(href)

    def start_img(self, attrs):
        src = [v for k, v in attrs if k == "src" and v.startswith("http")]
        if src:
            self.imgs.extend(src)


def get_url_of_page(url, if_img=False):
    '''
    Get all links on a page.
    if_img: if True, return the URLs of all images on the page instead.
    '''
    urls = []
    try:
        f = urllib2.urlopen(url, timeout=3).read()
        url_listen = URLLister()
        url_listen.feed(f)
        if if_img:
            urls.extend(url_listen.imgs)
        else:
            urls.extend(url_listen.urls)
    except urllib2.URLError, e:
        print e
    return urls


def get_page_html(begin_url, depth, main_site_domain):
    '''
    Process pages recursively.
    '''
    if depth <= 0:
        return
    print 'handle ' + begin_url
    passUrls.add(begin_url)
    #===========================================================================
    # Collect the image URLs on this page
    #===========================================================================
    urls = get_url_of_page(begin_url, True)
    #===========================================================================
    # Put the image URLs into the queue
    #===========================================================================
    for murl in urls:
        firstindex = murl.find('?')
        if firstindex != -1:
            print firstindex
            murl = murl[:firstindex]
        print 'add img url:' + murl
        qimg.put(murl)
    #===========================================================================
    # Collect the links on this page and recurse into them
    #===========================================================================
    urls = get_url_of_page(begin_url)
    if urls:
        for murl in urls:
            if murl not in passUrls:
                get_page_html(murl, depth - 1, main_site_domain)


class DPThread(Thread):
    '''
    Download thread
    '''
    def run3(self):
        # Slower variant: let urllib.urlretrieve handle the whole transfer.
        while True:
            murl = qimg.get()
            filename = murl.split("/")[-1]
            #dist = os.path.join(save_path, filename)
            dist = save_path + '/' + filename
            print dist
            print 'try connecting ' + filename
            if filename.endswith('jpg') or filename.endswith('png') or filename.endswith('gif') or filename.endswith('bmp') or filename.endswith('jpeg'):
                print 'downloading ' + filename
                dist = dist.replace('\\', '/')
                urllib.urlretrieve(murl, dist, None)
                print "Done: ", filename
            qimg.task_done()

    def run(self):
        # Faster variant: open the URL and write the bytes to disk directly.
        while True:
            murl = qimg.get()
            print 'one ' + murl
            filename = murl.split("/")[-1]
            urlopen = urllib.URLopener()
            try:
                fp = urlopen.open(murl)
                data = fp.read()
                fp.close()
                f = open(save_path + "/" + filename, 'w+b')
                f.write(data)
                f.close()
            except IOError:
                print "download error!" + murl
            qimg.task_done()


if __name__ == "__main__":
    #===========================================================================
    # First page to crawl for images
    #===========================================================================
    url = "http://image.baidu.com"
    #url = 'http://bringgooglereaderback.com/'
    #===========================================================================
    # Directory where downloaded images are saved
    #===========================================================================
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    #===========================================================================
    # Crawl depth
    #===========================================================================
    max_depth = 1
    main_site_domain = urlparse.urlsplit(url).netloc
    get_page_html(url, max_depth, main_site_domain)
    for i in range(1):
        t = DPThread()
        t.setDaemon(True)
        t.start()
    qimg.join()
    print 'end'

The code above contains a run method and a run3 method.

The run3 method uses urllib's urlretrieve to fetch the images, which turned out to be very slow. So in the run method I instead use urllib's URLopener to open the image URL, read the data, and write it straight to a file on local disk.
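For comparison, here is a minimal sketch of the two download styles side by side, using the same Python 2 urllib calls as the script above. The image URL and output paths are placeholders, not taken from the original post:

[python]

# -*- coding:utf-8 -*-
# Minimal sketch of the two download styles discussed above (Python 2).
# The URL and file paths below are placeholders.
import urllib

img_url = 'http://example.com/some_image.jpg'  # hypothetical image URL

# Variant 1 (run3): let urlretrieve handle the transfer and the file write.
urllib.urlretrieve(img_url, '/tmp/via_urlretrieve.jpg')

# Variant 2 (run): open the URL ourselves and write the bytes directly,
# which is what the post found to be faster in practice.
opener = urllib.URLopener()
fp = opener.open(img_url)
data = fp.read()
fp.close()
out = open('/tmp/via_urlopener.jpg', 'w+b')
out.write(data)
out.close()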

------------------------------------------------------------

I ran into a serious problem:

At first the threads were not set as daemon threads, so even when the Queue had no more items the script would not exit. After setting them as daemon threads, the script exits once the Queue is empty.
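The pattern boils down to this minimal Python 2 sketch of daemon workers plus Queue.join(); the work items here are placeholders:

[python]

# -*- coding:utf-8 -*-
# Minimal Python 2 sketch of the daemon-worker + Queue.join() pattern.
from Queue import Queue
from threading import Thread

q = Queue()

def worker():
    while True:                # the worker loops forever...
        item = q.get()
        print 'processing', item
        q.task_done()          # ...so it never returns on its own

for i in range(2):
    t = Thread(target=worker)
    t.setDaemon(True)          # without this, the endless workers keep the script alive
    t.start()

for item in range(5):          # placeholder work items
    q.put(item)

q.join()                       # main thread blocks until every item is task_done()
print 'end'                    # daemon workers are killed when the main thread exits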

------------------------------------------------------------

The explanation is as follows:

Python's threading behaves differently from C/C++ here: in C/C++, when the main thread ends, its child threads are killed by default. In Python, by default the main thread waits for its child threads to finish before the process exits.

Python offers two relevant calls for managing threads: join and setDaemon.

  • join: if thread B calls threada.join(), thread B only continues past the threada.join() call once threada has finished.
  • setDaemon: if main thread A starts a child thread B and calls b.setDaemon(True), then child thread B is killed when the main thread exits, which matches the default behaviour in C/C++ (see the sketch after this list).
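A small sketch contrasting the two calls, assuming a trivial sleeping worker (Python 2; the sleep time and names are placeholders):

[python]

# -*- coding:utf-8 -*-
# Contrast of join() and setDaemon() in Python 2; the sleeping worker is a placeholder.
import time
from threading import Thread

def slow_worker():
    time.sleep(2)
    print 'worker finished'

# join(): the caller blocks until the thread is done.
a = Thread(target=slow_worker)
a.start()
a.join()                       # main thread waits here ~2s, then continues
print 'after join'

# setDaemon(True): the thread dies with the main thread instead of keeping it alive.
b = Thread(target=slow_worker)
b.setDaemon(True)              # must be called before start()
b.start()
print 'main thread exiting'    # script ends here; b is killed, so it may never print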
