Case Study: Scraping Qiushibaike with Multiple Threads
The explanations are all in the code comments, so here is the code:
# Threading library
import threading
# Thread-safe FIFO queue; Empty is raised by a non-blocking get() on an empty queue
from queue import Queue, Empty
# HTML parsing
from lxml import etree
# HTTP requests
import requests
# Used to throttle requests
import time
class ThreadCrawl(threading.Thread):
    def __init__(self, thread_name, page_queue, data_queue):
        # threading.Thread.__init__(self)
        # Call the parent class initializer
        super(ThreadCrawl, self).__init__()
        # Thread name
        self.thread_name = thread_name
        # Page-number queue
        self.page_queue = page_queue
        # Data queue
        self.data_queue = data_queue
        # Request headers
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print("Starting " + self.thread_name)
        while not CRAWL_EXIT:
            try:
                # Take one page number from the queue (first in, first out).
                # The optional block argument defaults to True:
                # 1. If the queue is empty and block is True, get() does not return;
                #    it blocks until the queue has new data.
                # 2. If the queue is empty and block is False, get() raises queue.Empty.
                page = self.page_queue.get(False)
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                # print(url)
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)
                self.data_queue.put(content)
                # print(len(content))
            except Empty:
                # No page number available right now; keep polling until CRAWL_EXIT is set
                pass
            except requests.RequestException:
                # Ignore failed requests (the original bare except swallowed these too)
                pass
        print("Exiting " + self.thread_name)
class ThreadParse(threading.Thread):
    def __init__(self, thread_name, data_queue):
        super(ThreadParse, self).__init__()
        # Thread name
        self.thread_name = thread_name
        # Data queue
        self.data_queue = data_queue

    def run(self):
        print("Starting " + self.thread_name)
        while not PARSE_EXIT:
            try:
                html = self.data_queue.get(False)
                self.parse(html)
            except Empty:
                # No page HTML available yet; keep polling until PARSE_EXIT is set
                pass
            except Exception:
                # Ignore pages that fail to parse (the original bare except swallowed these too)
                pass
        print("Exiting " + self.thread_name)

    def parse(self, html):
        selector = etree.HTML(html)
        # Return the nodes of all posts. contains() is a fuzzy-match function:
        # the first argument is the attribute to match against, the second is part of its value.
        # Each node holds one complete post (username, content, votes, comments, and so on).
        node_list = selector.xpath('//div[contains(@id,"qiushi_tag_")]')
        for node in node_list:
            # Extract the username.
            # Use .text to get the text inside the tag.
            user_name = node.xpath('./div[@class="author clearfix"]//h2')[0].text
            # Extract the post content. The expression must start with "." or
            # matching starts from the whole page again instead of from this node.
            # Note: if the span contains <br> tags, a browser plugin shows clean text,
            # but in code the <br> markup gets pulled in as well.
            duanzi_info = node.xpath('.//div[@class="content"]/span')[0].text.strip()
            # Extract the number of votes
            vote_num = node.xpath('.//span[@class="stats-vote"]/i')[0].text
            # Extract the number of comments
            comment_num = node.xpath('.//span[@class="stats-comments"]//i')[0].text
            # Extract the image link.
            # This is the value of the src attribute, so .text is not needed.
            img_url = node.xpath('.//div[@class="thumb"]//@src')
            if len(img_url) > 0:
                img_url = img_url[0]
            else:
                img_url = "no image"
            self.save_info(user_name, duanzi_info, vote_num, comment_num, img_url)

    def save_info(self, user_name, duanzi_info, vote_num, comment_num, img_url):
        """Put the fields of one post into a dictionary and print it."""
        item = {
            "username": user_name,
            "content": duanzi_info,
            "zan": vote_num,
            "comment": comment_num,
            "image_url": img_url
        }
        print(item)
# Global flags the main thread sets to tell the worker threads to exit
CRAWL_EXIT = False
PARSE_EXIT = False
def main():
    # Page-number queue; holds up to 20 page numbers
    pageQueue = Queue(20)
    # Put the numbers 1 to 20 into the queue, first in, first out
    for i in range(1, 21):
        pageQueue.put(i)
    # Data queue for the crawl results (the HTML source of each page);
    # no argument means the queue is unbounded
    dataQueue = Queue()
    # Output file and lock (not used below: save_info only prints each item)
    filename = open("duanzi.json", "a")
    # Create a lock
    lock = threading.Lock()
    # Names of the three crawl threads
    crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]
    # List holding the three crawl threads
    thread_crawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        thread_crawl.append(thread)
    # Names of the three parse threads
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    # List holding the three parse threads
    thread_parse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue)
        thread.start()
        thread_parse.append(thread)
    # Wait until pageQueue is empty, i.e. until every page number has been taken
    while not pageQueue.empty():
        pass
    # Once pageQueue is empty, let the crawl threads leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue is empty")
    for thread in thread_crawl:
        thread.join()
    # Wait until dataQueue is empty, then let the parse threads leave their loops
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in thread_parse:
        thread.join()

if __name__ == "__main__":
    main()
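One thing worth noting about the shutdown logic above: the loops "while not pageQueue.empty(): pass" and "while not dataQueue.empty(): pass" busy-wait, spinning a CPU core, and they only observe that the queues are empty, not that every item has actually been processed. The standard-library alternative is Queue.task_done() together with Queue.join(). Below is a minimal standalone sketch of that pattern; the worker function, its print statement, and the thread count are illustrative and not taken from the original script.

import queue
import threading

def worker(page_queue):
    while True:
        try:
            # Block for up to two seconds instead of spinning in a busy loop
            page = page_queue.get(timeout=2)
        except queue.Empty:
            break
        try:
            print("would fetch and parse page", page)  # request/parse work goes here
        finally:
            # Tell the queue this item has been fully processed
            page_queue.task_done()

page_queue = queue.Queue()
for i in range(1, 21):
    page_queue.put(i)

threads = [threading.Thread(target=worker, args=(page_queue,)) for _ in range(3)]
for t in threads:
    t.start()
# join() returns only after task_done() has been called once for every put()
page_queue.join()
for t in threads:
    t.join()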
When the script runs, each parsed post is printed to the console as a dictionary with the keys username, content, zan, comment, and image_url.
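Also note that main() opens duanzi.json and creates a threading.Lock, but neither is ever used, because save_info only prints. If you want to persist the items instead, here is a minimal sketch; the save_item helper and its arguments are illustrative additions, not part of the original code. In the script itself you would pass filename and lock into ThreadParse's constructor and call something like this at the end of save_info.

import json
import threading

def save_item(item, out_file, lock):
    # Hold the lock so lines written by different threads do not interleave
    with lock:
        out_file.write(json.dumps(item, ensure_ascii=False) + "\n")

# Usage with the same kind of resources main() already creates
out_file = open("duanzi.json", "a", encoding="utf-8")
lock = threading.Lock()
save_item({"username": "example", "content": "...", "zan": "0",
           "comment": "0", "image_url": "no image"}, out_file, lock)
out_file.close()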
If you share the same interests as I do, feel free to add me as a friend so we can chat!