【互联网有记忆】爬取微博热搜榜并存入数据库(python爬虫+存储过程后端实现)

淡淡的烟草味﹌ 2023-07-18 06:23 68阅读 0赞

一、爬虫代码

  1. import random, time
  2. import requests, re
  3. import datetime
  4. import mysql.connector
  5. # 定义爬取间隔(minutes)
  6. interval_time = 15
  7. class HotSearchThread:
  8. def __init__(self):
  9. self.curTime = datetime.datetime.now()
  10. print('[start]开始爬取热搜榜..........')
  11. # 爬取html页面数据
  12. def getHtml(self):
  13. url = "https://s.weibo.com/top/summary?cate=realtimehot"
  14. headers = [
  15. {
  16. "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"
  17. },
  18. {
  19. "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
  20. },
  21. {
  22. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
  23. },
  24. {
  25. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
  26. },
  27. {
  28. "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27"
  29. }
  30. ]
  31. header = headers[random.randint(0,len(headers)-1)]
  32. response = requests.get(url=url, headers=header)
  33. return response.content.decode()
  34. # 处理html页面数据,得到绘图所需数据
  35. def getData(self, html):
  36. pattern = '<tr.*?<td\s+class="td-01.*?>(\d{1,2})</td>.*?<td\s+class="td-02.*?<a.*?>(.*?)</a>'
  37. data_rank = re.compile(pattern, re.S).findall(html)
  38. pattern_top = '<tr.*?<td\s+class="td-(\d)1.*?icon-top.*?</td>.*?<td\s+class="td-02.*?<a.*?>(.*?)</a>'
  39. data_top = re.compile(pattern_top, re.S).findall(html)
  40. data = data_top + data_rank
  41. tail = (datetime.datetime.strftime(self.curTime,'%Y%m%d_%H%M%S'), interval_time)
  42. for i in range(0,len(data)):
  43. data[i] += tail
  44. # print(data)
  45. print('[success]爬取成功!')
  46. return data
  47. # 存入数据库
  48. def saveDAO(self, data):
  49. # data为 list 类型
  50. # data[i]为 元组 类型,若len(data)为51,则data[0]为置顶
  51. # (ranknum, searchItem, time, duration)
  52. print('[start]开始写入数据库..........')
  53. conn = mysql.connector.connect(host="127.0.0.1", port=3306, user="WeiboHotSearch",
  54. password="ws1234.", database="WeiboHotSearch")
  55. cursor = conn.cursor()
  56. sql1="SELECT count(TABLE_NAME) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA='WeiboHotSearch' AND TABLE_NAME=%s;"
  57. tablename= "table" + datetime.datetime.strftime(self.curTime,'%Y%m')
  58. val1=(tablename,)
  59. cursor.execute(sql1, val1)
  60. res = cursor.fetchone()
  61. if res[0]==0:
  62. sql2="call CreateTablePro(%s);"
  63. val2=(tablename,)
  64. cursor.execute(sql2, val2)
  65. for item in data:
  66. sql3 = "call SelectDataPro(%s, %s, %s, %s, %s, @cnt);"
  67. sql4 = "select @cnt;"
  68. val3 = (tablename, int(item[0]), item[1], item[2], item[3])
  69. cursor.execute(sql3, val3)
  70. cursor.execute(sql4)
  71. cnt = cursor.fetchone()
  72. if cnt[0]==0:
  73. sql4="call InsertDataPro(%s, %s, %s, %s, %s)"
  74. val4=(tablename, int(item[0]), item[1], item[2], item[3])
  75. cursor.execute(sql4, val4)
  76. print("[Add ]新增条目:", item)
  77. else:
  78. sql5="call UpdateDataPro(%s, %s, %s, %s, %s)"
  79. val5 = (tablename, int(item[0]), item[1], item[2], item[3])
  80. cursor.execute(sql5, val5)
  81. print("[Update]更新条目:", item)
  82. conn.commit()
  83. cursor.close()
  84. conn.close()
  85. def run(self):
  86. print("[start] Time: ", datetime.datetime.strftime(self.curTime,'%Y-%m-%d %H:%M:%S'))
  87. data = self.getData(self.getHtml())
  88. self.saveDAO(data)
  89. print("--------------------------------------------------------------")
  90. print("[over]写入结束!")
  91. print("--------------------------------------------------------------")
  92. print("")
  93. print("")
  94. if __name__ == '__main__':
  95. print("[Initialized] 开机自启动加载完成!等待运行!")
  96. time.sleep(60) # 开机等待60秒再运行,防止数据库服务未启动等异常
  97. while 1:
  98. HotSearchThread().run()
  99. time.sleep(interval_time * 60)

二、数据库实现

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQxMTAxMjEz_size_16_color_FFFFFF_t_70

防止数据库单个表太大,按月份自动生成表,使用存储过程进行管理

20200324202818754.png

存储过程:

  1. --- CreateTablePro
  2. CREATE DEFINER=`WeiboHotSearch`@`%` PROCEDURE `CreateTablePro`(IN `tableName` varchar(50))
  3. BEGIN
  4. set @tname = tableName;
  5. set @sql_create_table = concat(
  6. 'CREATE TABLE IF NOT EXISTS ', @tname,
  7. '(num int(2),searchItem varchar(255) primary key,hotTime varchar(255),duration int(5))'
  8. );
  9. PREPARE sql_create_table from @sql_create_table;
  10. EXECUTE sql_create_table;
  11. END
  12. --- SelectDataPro
  13. CREATE DEFINER=`WeiboHotSearch`@`%` PROCEDURE `SelectDataPro`(IN `tableName` varchar(50),IN `num` int(2),IN `searchItem` varchar(255),IN `hotTime` varchar(255),IN `duration` int(5), OUT `cnt` int(5))
  14. BEGIN
  15. set @tname = tableName;
  16. set @num = num;
  17. set @sItem = searchItem;
  18. set @hTime = hotTime;
  19. set @dur = duration;
  20. set @sql_query_data = concat(
  21. 'select count(@sItem) into @cnt from ', @tname, ' where searchItem = @sItem'
  22. );
  23. PREPARE sql_query_data from @sql_query_data;
  24. EXECUTE sql_query_data;
  25. set cnt=@cnt;
  26. END
  27. --- InsertDataPro
  28. CREATE DEFINER=`WeiboHotSearch`@`%` PROCEDURE `InsertDataPro`(IN `tableName` varchar(50),IN `num` int(2),IN `searchItem` varchar(255),IN `hotTime` varchar(255),IN `duration` int(5))
  29. BEGIN
  30. set @tname = tableName;
  31. set @num = num;
  32. set @sItem = searchItem;
  33. set @hTime = hotTime;
  34. set @dur = 0;
  35. set @sql_insert_data = concat(
  36. 'insert into ', @tname,' values( @num, @sItem, @hTime, @dur);'
  37. );
  38. PREPARE sql_insert_data from @sql_insert_data;
  39. EXECUTE sql_insert_data;
  40. END
  41. --- UpdateDataPro
  42. CREATE DEFINER=`WeiboHotSearch`@`%` PROCEDURE `UpdateDataPro`(IN `tableName` varchar(50),IN `num` int(2),IN `searchItem` varchar(255),IN `hotTime` varchar(255),IN `duration` int(5))
  43. BEGIN
  44. set @tname = tableName;
  45. set @num = num;
  46. set @sItem = searchItem;
  47. set @hTime = hotTime;
  48. set @dur = duration;
  49. set @sql_insert_data1 = concat(
  50. 'update ', @tname,' set duration=duration+@dur where searchItem = @sItem'
  51. );
  52. PREPARE sql_insert_data1 from @sql_insert_data1;
  53. EXECUTE sql_insert_data1;
  54. set @sql_insert_data = concat(
  55. 'update ', @tname,' set num=@num, hotTime=@hTime where searchItem = @sItem and num>@num'
  56. );
  57. PREPARE sql_insert_data from @sql_insert_data;
  58. EXECUTE sql_insert_data;
  59. END

其他 :

  1. --- 建表模板
  2. drop table tablename;
  3. CREATE table tablename (
  4. num int(2),
  5. searchItem varchar(255) primary key,
  6. hotTime varchar(255),
  7. duration int(5)
  8. )
  9. --- 判断数据库中是否存在某张表
  10. SELECT count(TABLE_NAME) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA='WeiboHotSearch' AND TABLE_NAME='test';
  11. --- 存储过程的调用
  12. call CreateTablePro('test')
  13. call InsertDataPro('table202003', 0, '复工复产保卫战', '20200324_165840', 20)
  14. call SelectDataPro('table202003', 0, '复工复产保卫战', '20200324_165840', 20, @cnt);
  15. select @cnt;
  16. call UpdateDataPro('table202003', 30, '方方', '20200324_165840', 20)
  17. --- select 查询验证
  18. select * from table202003 where duration = 20
  19. --- mysql创建并调用含有out参数的存储过程
  20. CREATE PROCEDURE sp_add(a int, b int,out c int)
  21. begin
  22. set c=a+ b;
  23. end;
  24. --- 调用过程:
  25. call sp_add (1,2,@a);
  26. select @a;

三、aliyun部署

  1. # 脚本路径:
  2. /root/myPyProgram/Weibo.py
  3. # 脚本日志:
  4. /root/myPyProgram/logs/Weibo.log
  5. # 设置开机启动
  6. chmon +x /etc/rd.local
  7. vim /etc/rd.local
  8. #追加内容:
  9. /usr/bin/python3 -u /root/myPyProgram/Weibo.py &>> /root/myPyProgram/logs/Weibo.log
  10. # 查看进程状态
  11. ps -aux|grep "Weibo.py" # 会显示开始运行时间,结束时间等
  12. ps -ef|grep "Weibo.py" # 只显示进程相关信息
  13. # 查询进程id
  14. ps -ef | grep Weibo.py | grep -v grep | awk '{print $2}'
  15. # 查询进程id并杀死该进程
  16. ps -ef | grep Weibo.py | grep -v grep | awk '{print $2}' | xargs kill -9
  17. # 杀死进程
  18. kill -9 [进程id]

shell 脚本杀死进程 dsp-admin

  1. echo "开始查询DSP2.0运行的进程编号,查出将其kill"
  2. dsp_admin_id=`ps -ef | grep dsp-admin | grep -v "grep" | awk '{print $2}'`
  3. echo $dsp_admin_id
  4. for id in $dsp_admin_id
  5. do
  6. kill -9 $id
  7. echo "killed $id"
  8. done

四、运行截图

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQxMTAxMjEz_size_16_color_FFFFFF_t_70 1

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQxMTAxMjEz_size_16_color_FFFFFF_t_70 2

watermark_type_ZmFuZ3poZW5naGVpdGk_shadow_10_text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQxMTAxMjEz_size_16_color_FFFFFF_t_70 3

五、相关问题及解决方案

1、日志文件创建了,程序执行了,但是日志为0K,tail -f Weibo.log查看为空:

  1. [python程序输出无内容问题][python]

2、/usr/local/bin/python3不存在,需要添加软连接

  1. [ Linux下开机启动python脚本详解][Linux_python]

3、mysql存储过程相关

  1. [mysql存储过程用表名做参数 并获取execute执行语句的结果 ][mysql_ _execute]
  2. [MySql存储过程动态创建表并插入数据 ][MySql]
  3. [Mysql中表名作为参数的问题][Mysql]

4、linux查看进程状态

  1. [linux命令ps aux|grep xxx详解 ][linux_ps aux_grep xxx]

5、Linux中没有rc.local文件的完美解决方法

  1. [解决办法][Link 1]步骤一
  2. [解决办法步骤二][Link 2]

6、python获取当前系统时间并格式化

  1. [python 如何获取当前系统的时间][python 1]

7、linux脚本设置开机启动(后台)

  1. [链接 ][Link 3]

发表评论

表情:
评论列表 (有 0 条评论,68人围观)

还没有评论,来说两句吧...

相关阅读