Python Crawler: Zhihu, Simulated Login with Selenium to Get Cookies + requests.Session() Access + Session Serialization (reposted)

淡淡的烟草味﹌ 2023-10-11 09:26

Reposted from https://www.cnblogs.com/DOLFAMINGO/p/9170429.html
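The idea of the script: Zhihu's login flow is hard to reproduce with plain HTTP requests, so a real browser driven by Selenium performs the login once; the browser's cookies are then copied into a requests.Session(), and that session is pickled to disk so later runs can skip the browser entirely and crawl with requests alone. The full script: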

# coding:utf-8
from selenium import webdriver
import requests
import time
from lxml import etree
import pickle  # the original Python 2 version used cPickle
import os

class Zhihu:
    def __init__(self, homeurl):
        self.homeurl = homeurl

    def save_session(self, session):
        # Save the session so the next run can reuse it instead of logging in again.
        with open('session.txt', 'wb') as f:
            pickle.dump(session, f)
        print("Cookies have been written.")

    def load_session(self):
        # Load a previously saved session.
        with open('session.txt', 'rb') as f:
            s = pickle.load(f)
        return s

    def GetCookies(self):
        # First login: simulate it with selenium and harvest the browser's cookies.
        browser = webdriver.Chrome()
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/div[1]/div[2]/div[1]/input").send_keys("13060882373")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/div[2]/div/div[1]/input").send_keys("xxxxxx")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/button").click()
        time.sleep(10)
        cookies = browser.get_cookies()
        browser.quit()
        return cookies

    def get_session(self):
        # Obtain a session.
        s = requests.Session()
        if not os.path.exists('session.txt'):
            # No saved session yet: build one from selenium cookies and save it to a file.
            s.headers.clear()
            for cookie in self.GetCookies():
                s.cookies.set(cookie['name'], cookie['value'])
            self.save_session(s)
        else:
            # A session already exists: load and use it directly.
            s = self.load_session()
        return s

    def Crawl(self):
        # Start crawling.
        s = self.get_session()
        html = s.get(self.homeurl).text
        html_tree = etree.HTML(html)
        items = html_tree.xpath('//main//div[1]/div[2]//div[@class="ContentItem AnswerItem"]/@data-zop')
        for item in items:
            content = eval(item)
            authorName = content['authorName']
            title = content['title']
            print(authorName + " answered: " + title)

zhihu = Zhihu('https://www.zhihu.com/')
zhihu.Crawl()
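Two parts of this script have aged: find_element_by_xpath was removed in Selenium 4, and eval() fails on JSON literals such as true or null that a data-zop attribute may contain. A minimal sketch of those two steps against the Selenium 4 locator API and json.loads, reusing the original XPath expressions (which are assumptions about Zhihu's current markup and may need updating):

# coding:utf-8
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

def get_cookies_v4(username, password):
    # Same login flow as GetCookies above, written against the Selenium 4 API.
    browser = webdriver.Chrome()
    browser.get("https://www.zhihu.com/signin")
    # XPaths copied from the original post; treat them as placeholders.
    browser.find_element(By.XPATH, "//main//div[2]/div[1]/form/div[1]/div[2]/div[1]/input").send_keys(username)
    browser.find_element(By.XPATH, "//main//div[2]/div[1]/form/div[2]/div/div[1]/input").send_keys(password)
    browser.find_element(By.XPATH, "//main//div[2]/div[1]/form/button").click()
    time.sleep(10)  # crude wait for the post-login redirect; WebDriverWait is more robust
    cookies = browser.get_cookies()
    browser.quit()
    return cookies

def parse_items(html_tree):
    # json.loads handles true/false/null, which eval() of a Python dict literal does not.
    for item in html_tree.xpath('//main//div[1]/div[2]//div[@class="ContentItem AnswerItem"]/@data-zop'):
        content = json.loads(item)
        print(content['authorName'] + " answered: " + content['title'])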

Reposted from: https://www.cnblogs.com/Young-shi/p/11518090.html
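One practical caveat when reusing the pickled session: the cookies eventually expire, and load_session() will then hand back a logged-out session without any error. A hedged staleness check (the settings URL and the redirect heuristic are assumptions about Zhihu's behavior, not something from the original post):

import os

def session_is_alive(s):
    # A logged-out session typically gets redirected away from pages that
    # require login; a 200 without a redirect suggests the cookies still work.
    r = s.get("https://www.zhihu.com/settings/account", allow_redirects=False)
    return r.status_code == 200

# Usage: force a fresh selenium login once the cached session goes stale.
# if not session_is_alive(zhihu.get_session()):
#     os.remove('session.txt')
#     s = zhihu.get_session()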
