Scraping News from Websites


Military news scraping, from two sources:

  • 光明网 (Guangming Online) military news
  • 国防科技信息网 (dsti.net)

Both scripts follow the same pattern: request a list page, extract article links with regexes, fetch each article, and write the title, the full text, and 200-500 character paragraph chunks to CSV.

光明网 (Guangming Online) military news

```python
import csv
import re
import time

import requests
from bs4 import BeautifulSoup

session = requests.session()

# Output files: one CSV for full articles, one for paragraph chunks.
f = open('0905/0905原文.csv', 'a', encoding='utf-8', newline='')
fp = open('0905/0905段落.csv', 'a', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)


def getNewsDetail(newsurl):
    """Fetch one article and return (full text, list of 200-500 char chunks)."""
    news_p = []
    p1 = ''
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for p in soup.select('.u-mainText p'):
        p1 = p1 + p.text.strip().replace('\n', '')
        # Emit a chunk once the accumulated text reaches 200-500 characters.
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    news_article = ' '.join(p.text.strip() for p in soup.select('.u-mainText p'))
    return news_article, news_p


def spider():
    # List pages: node_8979.htm, node_8979_2.htm, ..., node_8979_10.htm
    pages = ['', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']
    for onepage in pages:
        # Build the list-page URL.
        url = "http://mil.gmw.cn/node_8979" + onepage + ".htm"
        print(url)
        # Spoof a browser request header.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        result = session.get(url=url, headers=headers).content
        soup = BeautifulSoup(result, 'html.parser')
        if soup is None:
            break
        # Locate the news list on the page.
        result_div = (soup.find('div', attrs={'class': 'channelLeftPart'})
                      .find_all('div')[1]
                      .find_all('ul', attrs={'class': 'channel-newsGroup'}))
        # Flatten whitespace so the regexes can match across line breaks.
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # Pull out each <li> entry.
        result_list = re.findall('<li>(.*?)</li>', result_replace)
        for i in result_list:
            # e.g. http://mil.gmw.cn/2020-09/04/content_34157244.htm
            news_url = 'http://mil.gmw.cn/' + re.findall('<a href="(.*?)" target=', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            print(news_name)
            # Title + paragraph chunks
            news_article, news_p = getNewsDetail(news_url)
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])
            # Title + full article
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
        time.sleep(3)


spider()
f.close()
fp.close()
```
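Running regexes over serialized HTML, as above, is fragile: any markup change on the list page silently breaks the `re.findall` patterns. A minimal alternative sketch, assuming the same `ul.channel-newsGroup` list structure that the regex version parses, pulls the links directly with BeautifulSoup selectors (`list_news_links` is a hypothetical helper, not part of the original script):

```python
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def list_news_links(list_url):
    """Yield (title, absolute URL) for every article link on one list page.

    Selector-based sketch; assumes the same ul.channel-newsGroup markup
    that the regex version above parses.
    """
    html = requests.get(list_url).content
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.select('ul.channel-newsGroup li a[href]'):
        # urljoin resolves relative hrefs such as 2020-09/04/content_34157244.htm
        yield a.get_text(strip=True), urljoin(list_url, a['href'])


for title, link in list_news_links('http://mil.gmw.cn/node_8979.htm'):
    print(title, link)
```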

国防科技信息网 (dsti.net)

```python
# -*- coding: utf-8 -*-
import csv
import re
import time

import requests
from bs4 import BeautifulSoup

session = requests.session()

# Output files: one CSV for full articles, one for paragraph chunks.
f = open('0906/0906electron原文.csv', 'w+', encoding='utf-8', newline='')
fp = open('0906/0906electron段落.csv', 'w+', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)


def getNewsDetail(newsurl):
    """Fetch one article and return (full text, list of 200-500 char chunks)."""
    news_p = []
    p1 = ''
    result = session.get(url=newsurl)
    soup = BeautifulSoup(result.text, 'html.parser')
    for p in soup.select('.newsContent p'):
        p1 = p1 + p.text.replace('\n', '')
        # Emit a chunk once the accumulated text reaches 200-500 characters.
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    news_article = ' '.join(p.text.strip().replace('\n', '') for p in soup.select('.newsContent p'))
    return news_article, news_p


def spider():
    # Channels on the same site:
    #   http://www.dsti.net/Information/HyeList/aviation/    pages 0-487
    #   http://www.dsti.net/Information/HyeList/spaceflight  pages 0-48
    #   http://www.dsti.net/Information/HyeList/electron/    pages 1-30
    for page in range(1, 30):
        # Build the list-page URL.
        url = "http://www.dsti.net/Information/HyeList/electron/" + str(page)
        print(url)
        # Spoof a browser request header.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        result = session.get(url=url, headers=headers).content
        # The site serves a GBK-family encoding; decode as its superset gb18030.
        soup = BeautifulSoup(result, 'html.parser', from_encoding="gb18030")
        if soup is None:
            break
        # Locate the news list on the page.
        result_div = soup.find('div', attrs={'class': 'listMidContent'}).find('ul')
        # Flatten whitespace so the regexes can match across line breaks.
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # Pull out each <li><h1> entry.
        result_list = re.findall('<li><h1>.(.*?)</h1>', result_replace)
        for i in result_list:
            # e.g. http://www.dsti.net/Information/News/120652
            news_url = 'http://www.dsti.net/' + re.findall('href="(.*?)" target="_blank">', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            # Title + paragraph chunks
            news_article, news_p = getNewsDetail(news_url)
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])
            # Title + full article
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
        # Be polite to the server between list pages.
        time.sleep(1)


spider()
f.close()
fp.close()
```
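The explicit `from_encoding="gb18030"` hint matters here because dsti.net serves a GBK-family encoding, and gb18030 is a superset of GBK/GB2312. When a target site's encoding is not known in advance, one hedged alternative is to let requests detect it from the response body before parsing; `fetch_soup` below is a hypothetical helper, not part of the original script:

```python
import requests
from bs4 import BeautifulSoup


def fetch_soup(url, headers=None):
    """Fetch a page and decode it with the encoding requests detects from
    the body bytes (apparent_encoding), which tends to be more reliable
    than the HTTP header on sites that mislabel GBK content."""
    res = requests.get(url, headers=headers)
    res.encoding = res.apparent_encoding or res.encoding
    return BeautifulSoup(res.text, 'html.parser')


soup = fetch_soup('http://www.dsti.net/Information/HyeList/electron/1')
print(soup.title.string if soup.title else 'no <title> found')
```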
