Web Scraping for Data Analysts: Scraping Dynamic and Ajax-Rendered Pages

超、凢脫俗 · 2022-10-16 10:26

Contents

  • Introduction
  • Ajax Scraping Example
  • JS Dynamic Loading Example

Introduction

The development environment is Python 3.6. For the full index of this crawler project series, see:

Understanding Python Crawler Frameworks: What You See Is What You Get, Anything Is Possible

This article walks through examples of scraping dynamic pages and Ajax-rendered pages, together with the page-analysis process behind each one. You may find that these crawls, which look complicated at first, are actually simpler than scraping static pages.

They say a Python data analyst who cannot write code is not a good data analyst, but you are not a professional developer either, so there is little payoff in polishing code for its own sake. Learning the basics of web scraping is enough to turn the tedious copy-and-paste work (Ctrl+C, Ctrl+V) into automation.

Ajax Scraping Example

More and more sites now serve an initial HTML document that contains no data at all; the data is loaded uniformly via Ajax. The process from sending an Ajax request to updating the page is:

  • Send the request.
  • Parse the response content.
  • Render the page.

Open the browser's developer tools, go to the Network tab, and filter by XHR to find these requests. Before running the code below, create the folders referenced in all_config_file.py, adjust that configuration to your environment, and start the required services (MongoDB in this case).
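Before building out the full project, it is worth confirming that the XHR request copied from the Network panel can be replayed outside the browser. A minimal sketch, reusing the cqcoal list endpoint and form fields that appear in the main program below (the concrete page number and type id are illustrative):

```python
# Replay an XHR request captured in the Network panel with requests.
import requests

url = "http://news.cqcoal.com/manage/newsaction.do?method:webListPageNewsArchivesByTypeid"
post_param = {"pageNum": 1, "pageSize": "20", "jsonStr": '{"typeid":"238"}'}
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

resp = requests.post(url, data=post_param, headers=headers, verify=False, timeout=10)
print(resp.status_code)
print(resp.text[:300])   # the response body is the JSON the page renders from
```

If the status code and payload match what the browser shows under the XHR filter, the rest of the crawler is just a loop around this request.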


all_config_file.py

```python
# coding=utf-8
__author__ = 'Mr数据杨'
__explain__ = 'Configuration file shared by the per-site crawler scripts'

# load referenced modules
import time
import pymongo
import pandas as pd


def news_page_num():
    page_num = input("Number of pages to crawl per site: ")
    return int(page_num)


def title_error_num():
    title_error_num = input("Maximum number of title errors allowed: ")
    return int(title_error_num)


def body_error_num():
    body_error_num = input("Maximum number of body errors allowed: ")
    return int(body_error_num)


def mongodb_client():
    # get the MongoClient object
    client = pymongo.MongoClient("localhost", 27017)
    # get the database object used by the crawlers
    db = client.news
    print("MongoDB database loaded......")
    return db


db = mongodb_client()


def time_today():
    # global date helper
    time_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    print("Global date helper loaded......")
    return time_today


# error log for title crawling
def error_text_title(text, time_today):
    print("Title error log loaded......")
    with open("logs/" + time_today + " news_title_error.txt", "a") as f:
        f.write(text + '\n')


# error log for body crawling
def error_text_body(text, time_today):
    with open("logs/" + time_today + " news_body_error.txt", "a") as f:
        f.write(text + '\n')


# fetch the link of every page to crawl for a given site label and category
def get_title_links_from_MongoDB(label, type):
    result = []
    for item in db.news_tmp.find({'label': label, 'type': type}, {'url': 1, '_id': 1}):
        result.append(item)
    result = pd.DataFrame(result, columns=['url', '_id'])
    return result
```
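A quick smoke test of these helpers (a hypothetical snippet, not part of the original project) could look like this:

```python
# Hypothetical smoke test for all_config_file: importing it connects to MongoDB,
# and the link query should come back as a DataFrame with 'url' and '_id' columns.
import all_config_file

pages = all_config_file.news_page_num()   # interactive prompt for the page count
print("pages per site:", pages)

links = all_config_file.get_title_links_from_MongoDB("news.cqcoal.com", "price")
print(links.head())                       # empty until the title crawler has run
```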

Main program

```python
# load referenced modules
import datetime
import requests
import all_config_file
from all_config_file import error_text_title
from all_config_file import error_text_body
from all_config_file import get_title_links_from_MongoDB

# Ajax list endpoint of news.cqcoal.com
cqcoal = "http://news.cqcoal.com/manage/newsaction.do?method:webListPageNewsArchivesByTypeid"
print("Target URL loaded......")

db = all_config_file.mongodb_client()
time_today = all_config_file.time_today()


def cqcoal_title_start(num):
    def start_type(url, label, typeid, pagenum, type):
        try:
            page_num = 1
            while page_num <= pagenum:
                print("Start crawling: " + url)
                user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
                headers = {'User-Agent': user_agent}
                # the list API is paged through POST form fields
                post_param = {'pageNum': page_num, 'pageSize': '20', 'jsonStr': typeid}
                return_data = requests.post(url, data=post_param, headers=headers,
                                            verify=False, timeout=10)
                content = return_data.text
                if label == 'news.cqcoal.com':
                    one_page = get_cqcoal_page_news(content, type)
                page_num += 1
            print('Title list crawled')
        except:
            error = str(url + " label:" + label + " gd:" + str(typeid) +
                        " pagenum:" + str(pagenum) + " type:" + type + ' fetch failed')
            error_text_title(error, time_today)
            print(error)

    def get_cqcoal_page_news(content, type):
        # the response is raw JSON text; fields are sliced out by their offsets
        l = content.split("},{")
        for i in range(len(l)):
            url = "http://news.cqcoal.com/blank/nc.jsp?mid=" + l[i][l[i].find("id") + 4:l[i].find("typeid") - 2]
            title = l[i][l[i].find("title") + 8:l[i].find("shorttitle") - 3]
            typename = l[i][l[i].find("typename") + 11:l[i].find("typeid2") - 3]
            timeStamp = l[i][(l[i].find("pubdate") + 10):(l[i].find("senddate") - 3)]
            description = l[i][l[i].find("description") + 14:l[i].find("filename") - 3]
            timeStamp = int(timeStamp)
            dateArray = datetime.datetime.utcfromtimestamp(timeStamp)
            pubdate = dateArray.strftime("%Y-%m-%d")
            one_page = {'title': title, 'url': url, 'date': pubdate,
                        'type': type, 'label': 'news.cqcoal.com'}
            db.news_tmp.insert_one(one_page)
        return one_page

    ### news.cqcoal.com category starters
    def start_Cqcoal_supply_and_demand():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"238"}', num, 'supply_and_demand')

    def start_Cqcoal_price():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"234"}', num, 'price')

    def start_Cqcoal_dynamic():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"235"}', num, 'dynamic')

    def start_Cqcoal_international():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"236"}', num, 'international')

    def start_Cqcoal_comment():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"14"}', num, 'comment')

    def start_Cqcoal_transportation():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"239"}', num, 'transportation')

    def start_Cqcoal_economics():
        start_type(cqcoal, 'news.cqcoal.com', 'road_price}', num, 'economics')

    def start_Cqcoal_policy():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"230"}', num, 'policy')

    def start_Cqcoal_correlation():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"237"}', num, 'correlation')

    def start_Cqcoal_expert():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"232"}', num, 'expert')

    start_Cqcoal_transportation()
    start_Cqcoal_supply_and_demand()
    start_Cqcoal_price()
    start_Cqcoal_policy()
    start_Cqcoal_international()
    start_Cqcoal_expert()
    start_Cqcoal_economics()
    start_Cqcoal_dynamic()
    start_Cqcoal_correlation()
    start_Cqcoal_comment()


def cqcoal_body_start():
    def get_new_body(label, type):
        link_list = get_title_links_from_MongoDB(label, type)
        if label == 'news.cqcoal.com':
            try:
                for url in link_list['url']:
                    news_body, news_body_1 = get_news_Cqcoal_text(url, label, type)
                    if news_body is not None:
                        db.news_tmp.update_one({'url': url}, {"$set": {'newsbody': news_body}})
                        db.news_tmp.update_one({'url': url}, {"$set": {'newsbody_1': news_body_1}})
                print("Site: " + label + " type: " + type + " bodies crawled!")
            except:
                error = str(url + " error:" + ' label:' + label + " type:" + type)
                print(error)

    def get_news_Cqcoal_text(url, label, type):
        # the article body is served by a second Ajax endpoint keyed by the article id
        id = url.split('=')[1]
        url = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
        post_param = {'id': id}
        try:
            return_data = requests.post(url, data=post_param, verify=False, timeout=120)
        except:
            print("error label:", url, " Time out!")
            error = str(url + " error:" + ' label:' + label + " type:" + type)
            error_text_body(error, time_today)
            return None, None
        return_data = return_data.text
        try:
            newsBody = return_data[return_data.find("body") + 7:return_data.find("xh") - 3]
            newsBody_1 = return_data[return_data.find("body") + 7:return_data.find("xh") - 3]
            print(url + " record crawled")
            return newsBody, newsBody_1
        except:
            print("error label:", url, " type:", type)
            error = str(url + " error:" + ' label:' + label + " type:" + type)
            error_text_body(error, time_today)
            return None, None

    ### news.cqcoal.com body starters
    def start_body_Cqcoal_transportation():
        get_new_body('news.cqcoal.com', 'transportation')

    def start_body_Cqcoal_supply_and_demand():
        get_new_body('news.cqcoal.com', 'supply_and_demand')

    def start_body_Cqcoal_price():
        get_new_body('news.cqcoal.com', 'price')

    def start_body_Cqcoal_policy():
        get_new_body('news.cqcoal.com', 'policy')

    def start_body_Cqcoal_international():
        get_new_body('news.cqcoal.com', 'international')

    def start_body_Cqcoal_expert():
        get_new_body('news.cqcoal.com', 'expert')

    def start_body_Cqcoal_dynamic():
        get_new_body('news.cqcoal.com', 'dynamic')

    def start_body_Cqcoal_economics():
        get_new_body('news.cqcoal.com', 'economics')

    def start_body_Cqcoal_correlation():
        get_new_body('news.cqcoal.com', 'correlation')

    def start_body_Cqcoal_comment():
        get_new_body('news.cqcoal.com', 'comment')

    start_body_Cqcoal_transportation()
    start_body_Cqcoal_supply_and_demand()
    start_body_Cqcoal_price()
    start_body_Cqcoal_policy()
    start_body_Cqcoal_international()
    start_body_Cqcoal_expert()
    start_body_Cqcoal_economics()
    start_body_Cqcoal_dynamic()
    start_body_Cqcoal_correlation()
    start_body_Cqcoal_comment()
```
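The script above only defines the two drivers, `cqcoal_title_start` and `cqcoal_body_start`; how they are invoked is not shown. One plausible entry point, stated here as an assumption rather than the original author's code, would be:

```python
# Hypothetical entry point: crawl the title lists first, then fill in the bodies.
if __name__ == '__main__':
    num = all_config_file.news_page_num()   # pages to fetch per category
    cqcoal_title_start(num)
    cqcoal_body_start()
```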

JS Dynamic Loading Example

The browser rendering engine: the rendering engine's job is to render, that is, to display the requested content in the browser window. The browser sends a request to the server, and once the server returns the resource files, the rendering engine processes them and displays the result in the window.

Two rendering engines are in wide use today:

  • WebKit, used by Safari (and, via its Blink fork, by Chrome)
  • Gecko, used by Firefox

The main rendering flow: the engine first fetches the content of the requested document over the network, usually in 8 KB chunks. Once the content arrives, the basic flow is:

  • Parse the HTML to build the DOM tree -> build the render tree -> lay out the render tree -> paint the render tree. The engine parses the HTML and turns each tag into a DOM node in the content tree. When it encounters JavaScript, it opens a separate connection to download it and parses it after the download completes.
  • Next it parses external CSS files and the style information in style tags. This styling information, together with visibility directives in the HTML, is used to build another tree: the render tree.
  • The render tree is made up of rectangles carrying attributes such as color and size, laid out in the order in which they will appear on screen.
  • Once the render tree has been built, a layout pass determines the exact on-screen coordinates of every node.
  • The final step is painting: the render tree is traversed and each node is drawn through the UI backend layer.

To render a dynamic page, there are two options:

  • Implement a browser rendering engine from scratch and return the constructed DOM or render tree at the right moment. This takes an enormous amount of work: you would have to handle the parsing of HTML, JS, and CSS and the order in which each is processed.
  • Reuse an existing rendering engine, that is, let a real browser do the rendering and read the result (see the sketch below).
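If you genuinely need the rendered DOM, the practical route is the second option: drive an existing engine, i.e. a real browser. A minimal sketch with Selenium and headless Chrome (purely illustrative; the 36Kr example below avoids rendering altogether by calling the site's JSON API directly):

```python
# Let a real browser's engine build the DOM/render tree, then read the result.
# Assumes selenium and a matching chromedriver are installed.
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get("https://36kr.com/")
time.sleep(3)                          # crude wait for the page's JS to finish
rendered_html = driver.page_source     # DOM after JS execution, not the raw HTML
driver.quit()
print(rendered_html[:300])
```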

The example below scrapes the homepage columns of 36Kr (36氪).

```python
import warnings
warnings.filterwarnings("ignore")
import time
import requests
import pymongo
import pandas as pd
import re
from lxml import etree

# get the MongoClient object
client = pymongo.MongoClient("localhost", 27017)
# get the database object used by the crawler
db = client.news
today = time.strftime('%Y.%m.%d', time.localtime(time.time()))


def main(page_num):
    # number of list pages to crawl per column
    n = int(page_num)

    def start_crawler(pro, col, adress):
        i = 1
        while i <= n:
            t = time.time()
            # the column list is served as JSON by an Ajax API, keyed by column id and page
            url = "https://36kr.com/api/search-column/{}?per_page=20&page={}&_=".format(pro, i) + str(int(t))
            i += 1
            time.sleep(2)
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            return_data = requests.get(url, headers=headers, verify=False)
            one_page = get_news_link_to_mongodb(return_data, col, url, adress)
        print(adress + " " + col + ' title list crawled')

    def get_news_link_to_mongodb(return_data, col, url, adress):
        # every item in the JSON payload becomes one MongoDB document
        for i in return_data.json().get('data').get('items'):
            one_page = {'title': i["title"],
                        'url_html': "https://36kr.com/p/" + str(i["id"]) + ".html",
                        'url_json': "https://36kr.com/p/" + str(i["id"]) + ".json",
                        'summary': i["summary"],
                        'tags': re.sub(r'[0-9"\[\]]', '', i["extraction_tags"]).replace(",,", " ").replace(",", ""),
                        'label': col,
                        'adress': adress,
                        'work_date': time.strftime('%Y.%m.%d', time.localtime(time.time()))}
            db.kr36.insert_one(one_page)
        print("Finished crawling " + adress + " " + col + " column page: " + url)

    def news_body_start(label, adress, today):
        url_list = []
        for item in db.kr36.find({'label': label, 'adress': adress, 'work_date': today}, {'url_json': 1}):
            url_list.append(item)
        url_list = pd.DataFrame(url_list, columns=['url_json'])
        for i in url_list["url_json"]:
            html = requests.get(i)
            # the .json endpoint returns the article body as an HTML fragment
            a = html.json().get('props').get('detailArticle|post').get('content')
            sel = etree.HTML(a)
            clear_content = sel.xpath('string(//*)')   # strip the tags, keep the text
            db.kr36.update_one({'url_json': i}, {"$set": {'newsbody': clear_content}})
            print(i + " crawled")
        print(adress + " " + label + " article bodies crawled")

    # crawl the title lists of each column, then fill in the bodies
    start_crawler('23', '大公司', "36kr")
    start_crawler('221', '消费', "36kr")
    start_crawler('225', '娱乐', "36kr")
    start_crawler('218', '前沿技术', "36kr")
    start_crawler('219', '汽车交通', "36kr")
    start_crawler('208', '区块链', "36kr")
    start_crawler('103', '技能get', "36kr")
    news_body_start("大公司", "36kr", today)
    news_body_start("消费", "36kr", today)
    news_body_start("娱乐", "36kr", today)
    news_body_start("前沿技术", "36kr", today)
    news_body_start("汽车交通", "36kr", today)
    news_body_start("区块链", "36kr", today)
    news_body_start("技能get", "36kr", today)

    # export everything crawled from 36kr to CSV
    name = {'adress': '36kr'}
    search_res = db.kr36.find(name)
    list_ = []
    for i in search_res:
        list_.append(i)
    ddf = pd.DataFrame(list_, columns=["title", "url_html", "tags", "label", "adress", "summary", "newsbody"])
    ddf.to_csv("36氪首页news.csv", encoding="utf_8_sig")


if __name__ == '__main__':
    page_num = input("Number of pages to crawl: ")
    main(page_num)
```
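The body-extraction step above relies on lxml's `string()` XPath to strip the tags from the HTML fragment returned by each article's `.json` endpoint. A standalone illustration with a made-up fragment:

```python
# Strip tags from an HTML fragment with lxml's string() XPath.
from lxml import etree

html_fragment = "<p>36Kr <b>demo</b> paragraph.</p><p>Second paragraph.</p>"
sel = etree.HTML(html_fragment)
plain_text = sel.xpath('string(//*)')   # concatenated text of the whole tree
print(plain_text)                       # -> "36Kr demo paragraph.Second paragraph."
```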
