Scraping Tencent Job Postings with a Python Crawler

朱雀 · 2021-07-26 20:01


Without further ado, here's the code. The script pages through the job listing at hr.tencent.com (303 pages, 10 postings each), parses the alternating .even/.odd table rows with BeautifulSoup, follows each posting's detail link to collect the job responsibilities and requirements, and writes each page's results out as a JSON file.

```python
# Written for Python 3 (urllib.request); the original used the Python 2
# urllib2 module, whose Request/urlopen calls map over one-to-one.
# Note: hr.tencent.com has since been replaced by careers.tencent.com,
# so these legacy listing pages may no longer be served.
from bs4 import BeautifulSoup
import urllib.request
import json  # results are stored as JSON


def tengxun(detail, num):
    url = 'https://hr.tencent.com/'
    # detail = 'position.php?&start=0#a'
    request = urllib.request.Request(url + detail)
    response = urllib.request.urlopen(request)
    resHtml = response.read()
    soup = BeautifulSoup(resHtml, 'html.parser', from_encoding='utf-8')
    # the listing rows alternate between the .even and .odd CSS classes
    result = soup.select('.even')
    result += soup.select('.odd')
    # parse the page
    items = []
    for node in result:
        item = {}
        # job title
        zname = node.select('td')[0].get_text()
        # job category
        ztype = node.select('td')[1].get_text()
        # headcount
        znum = node.select('td')[2].get_text()
        # location
        zlocal = node.select('td')[3].get_text()
        # publish date
        ztime = node.select('td')[4].get_text()
        # link to the detail page
        detailLink = node.select('td a')[0].attrs['href']
        # fetch the responsibilities and requirements from the detail page
        request1 = urllib.request.Request(url + detailLink)
        response1 = urllib.request.urlopen(request1)
        jobHtml = response1.read()
        soup1 = BeautifulSoup(jobHtml, 'html.parser', from_encoding='utf-8')
        # responsibilities: first ul.squareli block on the detail page
        jobRes = ''
        for li in soup1.select('ul.squareli')[0].select('li'):
            jobRes += li.get_text() + '\n'
        # requirements: second ul.squareli block
        jobReq = ''
        for li in soup1.select('ul.squareli')[1].select('li'):
            jobReq += li.get_text() + '\n'
        # collect all fields into the item dict
        item['zname'] = zname
        item['detailLink'] = detailLink
        item['ztype'] = ztype
        item['znum'] = znum
        item['zlocal'] = zlocal
        item['ztime'] = ztime
        item['jobRes'] = jobRes
        item['jobReq'] = jobReq
        items.append(item)
    print(len(items))
    # dump one JSON file per listing page; ensure_ascii=False keeps the
    # Chinese text readable instead of escaping it to \uXXXX sequences
    with open('tencent' + str(num) + '.json', 'w', encoding='utf-8') as output:
        output.write(json.dumps(items, ensure_ascii=False))


# the listing spans 303 pages of 10 postings each
for i in range(303):
    print('Fetching page ' + str(i))
    url = 'position.php?&start=' + str(i * 10) + '#a'
    tengxun(url, i)
```
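
To sanity-check a run, load one of the dumped files back in. A minimal sketch, assuming the first listing page was saved as tencent0.json by the script above:

```python
import json

# read back the first page's results
with open('tencent0.json', encoding='utf-8') as f:
    jobs = json.load(f)

# each entry holds the fields collected by the crawler
print(len(jobs))
for job in jobs[:3]:
    print(job['zname'], '|', job['zlocal'], '|', job['ztime'])
```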

