Python 爬虫爬取百度文库文档

矫情吗;* 2021-07-26 21:06 927阅读 0赞

使用 Python 爬虫爬取百度文库文档中的文字

话不多说,直接上代码!

import re

import requests
  3. headers = {
  4. "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"
  5. } # 模拟手机
  6. def get_num(url):
  7. response = requests.get(url, headers=headers).text
  8. result = re.search(
  9. r'&md5sum=(.*)&sign=(.*)&rtcs_flag=(.*)&rtcs_ver=(.*?)".*rsign":"(.*?)",', response, re.M | re.I) # 寻找参数
  10. reader = {
  11. "md5sum": result.group(1),
  12. "sign": result.group(2),
  13. "rtcs_flag": result.group(3),
  14. "rtcs_ver": result.group(4),
  15. "width": 176,
  16. "type": "org",
  17. "rsign": result.group(5)
  18. }
  19. result_page = re.findall(
  20. r'merge":"(.*?)".*?"page":(.*?)}', response) # 获取每页的标签
  21. doc_url = "https://wkretype.bdimg.com/retype/merge/" + url[29:-5] # 网页的前缀
  22. n = 0
  23. for i in range(len(result_page)): # 最大同时一次爬取10页
  24. if i % 10 is 0:
  25. doc_range = '_'.join([k for k, v in result_page[n:i]])
  26. reader['pn'] = n + 1
  27. reader['rn'] = 10
  28. reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_10' % (
  29. reader.get('pn'))
  30. reader['range'] = doc_range
  31. n = i
  32. get_page(doc_url, reader)
  33. else: # 剩余不足10页的
  34. doc_range = '_'.join([k for k, v in result_page[n:i + 1]])
  35. reader['pn'] = n + 1
  36. reader['rn'] = i - n + 1
  37. reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_%s' % (
  38. reader.get('pn'), reader.get('rn'))
  39. reader['range'] = doc_range
  40. get_page(doc_url, reader)
  41. def get_page(url, data):
  42. response = requests.get(url, headers=headers, params=data).text
  43. response = response.encode(
  44. 'utf-8').decode('unicode_escape') # unciode转为utf-8 然后转为中文
  45. response = re.sub(r',"no_blank":true', '', response) # 清洗数据
  46. result = re.findall(r'c":"(.*?)"}', response) # 寻找文本匹配
  47. result = '\n'.join(result)
  48. with open("C:/Users/86135/Desktop/百度文库.txt",'wt') as f:
  49. f.write(result)
  50. if __name__ == '__main__':
  51. url = input("请输入百度文库的地址:")
  52. get_num(url)

爬取结果如下:
(原文此处为爬取结果的截图,图片未随文本保留。)

发表评论

表情:
评论列表 (有 0 条评论,927人围观)

还没有评论,来说两句吧...

相关阅读

    相关 python爬虫云盘

     在网上看到的教程,但是我嫌弃那个教程写的乱(虽然最后显示我也没高明多少,哈哈),就随手写了一个 主要是嫌弃盘搜那些恶心的广告,这样直接下载下来,眼睛清爽多了。 用p