Crawling Images with Python
Based on someone else's code, with multithreading and a Queue added on top.
```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Created on 2013-3-21
@author: naughty
'''
# author: wklken
from sgmllib import SGMLParser
import os
import urllib
import urllib2
import urlparse
from Queue import Queue
from threading import Thread

save_path = '/home/zoer'
passUrls = set()
qimg = Queue()


class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.imgs = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == "href" and v.startswith("http")]
        if href:
            self.urls.extend(href)

    def start_img(self, attrs):
        src = [v for k, v in attrs if k == "src" and v.startswith("http")]
        if src:
            self.imgs.extend(src)


def get_url_of_page(url, if_img=False):
    '''
    Collect all links on a page.
    if_img: if True, collect the image links on the page instead.
    '''
    urls = []
    try:
        f = urllib2.urlopen(url, timeout=3).read()
        url_listen = URLLister()
        url_listen.feed(f)
        if if_img:
            urls.extend(url_listen.imgs)
        else:
            urls.extend(url_listen.urls)
    except urllib2.URLError, e:
        print e
    return urls


def get_page_html(begin_url, depth, main_site_domain):
    '''
    Process pages recursively.
    '''
    if depth <= 0:
        return
    print 'handle ' + begin_url
    passUrls.add(begin_url)
    #===========================================================================
    # Collect the image URLs on this page
    #===========================================================================
    urls = get_url_of_page(begin_url, True)
    #===========================================================================
    # Put the image URLs into the queue
    #===========================================================================
    for murl in urls:
        firstindex = murl.find('?')
        if firstindex != -1:
            print firstindex
            murl = murl[:firstindex]
        print 'add img url:' + murl
        qimg.put(murl)
    #===========================================================================
    # Collect the page links on this page
    #===========================================================================
    urls = get_url_of_page(begin_url)
    if urls:
        for murl in urls:
            if murl not in passUrls:
                get_page_html(murl, depth - 1, main_site_domain)


class DPThread(Thread):
    '''
    Download thread
    '''
    def run3(self):
        # Download via urllib.urlretrieve (turned out to be slow, see run()).
        while True:
            murl = qimg.get()
            filename = murl.split("/")[-1]
            #dist = os.path.join(save_path, filename)
            dist = save_path + '/' + filename
            print dist
            print 'try connecting ' + filename
            if filename.endswith('jpg') or filename.endswith('png') or filename.endswith('gif') or filename.endswith('bmp') or filename.endswith('jpeg'):
                print 'downloading ' + filename
                dist = dist.replace('\\', '/')
                urllib.urlretrieve(murl, dist, None)
                print "Done: ", filename
            qimg.task_done()

    def run(self):
        # Download by opening the URL and writing the raw bytes to disk.
        while True:
            murl = qimg.get()
            print 'one ' + murl
            filename = murl.split("/")[-1]
            urlopen = urllib.URLopener()
            try:
                fp = urlopen.open(murl)
                data = fp.read()
                fp.close()
                f = open(save_path + "/" + filename, 'w+b')
                f.write(data)
                f.close()
            except IOError:
                print "download error!" + murl
            qimg.task_done()


if __name__ == "__main__":
    #===========================================================================
    # First page to crawl images from
    #===========================================================================
    url = "http://image.baidu.com"
    #url = 'http://bringgooglereaderback.com/'
    #===========================================================================
    # Directory the images are saved to
    #===========================================================================
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    #===========================================================================
    # Crawl depth
    #===========================================================================
    max_depth = 1
    main_site_domain = urlparse.urlsplit(url).netloc
    get_page_html(url, max_depth, main_site_domain)
    for i in range(1):
        t = DPThread()
        t.setDaemon(True)
        t.start()
    qimg.join()
    print 'end'
```
The code above defines both a run method and a run3 method.
run3 uses urllib's urlretrieve to fetch each image, which turned out to be very slow. So in run I instead use urllib's URLopener to open the image URL, read the data, and write it straight to a file on the local disk.
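For reference, a minimal, self-contained sketch (Python 2, not part of the original script; the URL and destination path are placeholders) isolating the two download approaches:

```python
import urllib

img_url = 'http://example.com/pic.jpg'   # hypothetical image URL
dest = '/tmp/pic.jpg'                    # hypothetical destination path

# Approach used in run3: urlretrieve handles open/read/write in one call.
urllib.urlretrieve(img_url, dest)

# Approach used in run: open the URL yourself and write the raw bytes to disk.
fp = urllib.URLopener().open(img_url)
data = fp.read()
fp.close()
with open(dest, 'wb') as f:
    f.write(data)
```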
---
A serious problem I ran into:
At first the worker threads were not marked as daemon threads, so the script would not exit even after the Queue was empty. After setting them as daemon threads, the script exits once the Queue has no more content.
---
The explanation is as follows:
Some of Python's threading behavior differs from C/C++. In C/C++, when the main thread ends, its child threads are killed by default. In Python, by default the main thread waits for its child threads to finish before the process exits.
Python's thread management provides two relevant methods, join and setDaemon (a minimal sketch follows the list below):
- join: if thread B calls threada.join(), then B blocks at that call and only continues past threada.join() after threada has finished.
- setDaemon: if main thread A starts child thread B and calls b.setDaemon(True), then B is killed when the main thread ends, which matches the default behavior in C/C++.
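A minimal sketch (Python 2; illustrative only, not taken from the original script) of the same pattern the crawler relies on: daemon workers plus Queue.join() in the main thread. join() returns once every queued item has been marked task_done(), and the daemon workers are then killed when the main thread exits, so the endless worker loops no longer keep the process alive:

```python
from Queue import Queue
from threading import Thread
import time

q = Queue()

def worker():
    while True:             # loops forever; would block a non-daemon exit
        item = q.get()
        time.sleep(0.1)     # pretend to download something
        print 'done', item
        q.task_done()

for i in range(3):
    t = Thread(target=worker)
    t.setDaemon(True)       # without this, the endless loops keep the process alive
    t.start()

for i in range(10):
    q.put(i)

q.join()                    # blocks until every item has been task_done()
print 'all work finished, main thread exits and daemon workers die'
```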