Scraping all second-hand housing development (loupan) listings from Lianjia
缺乏、安全感 / 2022-02-04 17:13

The code is as follows:

import math
import time

import requests
from lxml import etree


def request_url(url):
    # fetch a page and parse it into an lxml element tree
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get(url, headers=headers).content.decode('utf-8')
    return etree.HTML(response)


def pankong(temp_list):
    # "empty check": first element of an XPath result list, or '' if it is empty
    if len(temp_list) > 0:
        return temp_list[0]
    return ''


# city links on the homepage -> one loupan index per city
tree = request_url('https://bj.fang.lianjia.com/')
href_list = tree.xpath('//div[@class="fc-main clear"]//li//div[@class="city-enum fl"]/a/@href')

for href in href_list:
    full_href = 'http:' + href + '/loupan/'
    # listing index page
    tree1 = request_url(full_href)
    # total number of developments -> page count (10 results per page)
    num = tree1.xpath('.//div[@class="resblock-have-find"]/span[@class="value"]/text()')
    if not num:
        continue
    page = math.ceil(int(num[0]) / 10)
    print('%d pages in total' % page)
    for pg in range(1, page + 1):
        full_link = full_href + 'pg%d/' % pg
        print('page %d link:' % pg, full_link)
        html1_tree = request_url(full_link)
        home_list = html1_tree.xpath('//li[@class="resblock-list post_ulog_exposure_scroll has-results"]')
        if not home_list:
            print('no more data')
            continue
        for li in home_list:
            try:
                # listing image
                img = pankong(li.xpath('.//img[@class="lj-lazy"]/@data-original'))
                # development name
                name = pankong(li.xpath('.//div[@class="resblock-name"]/a/text()'))
                # district
                chengqu = pankong(li.xpath('.//div[@class="resblock-location"]//span/text()'))
                # business district / tag line
                shangquan1 = ''.join(li.xpath('.//div[@class="resblock-tag"]//span/text()'))
                # address
                address = pankong(li.xpath('.//div[@class="resblock-location"]/a/text()'))
                # main floor plans
                huxing = pankong(li.xpath('.//a[@class="resblock-room"]/span/text()'))
                # floor area
                areas = pankong(li.xpath('.//div[@class="resblock-area"]/span/text()'))
                # price ('\xa0' is the &nbsp; separator in the price markup)
                price1 = ''.join(li.xpath('.//div[@class="main-price"]/span/text()')).replace('\xa0', '')
                # development type tag
                biaoqian = pankong(li.xpath('.//span[@class="resblock-type"]/text()'))
                # detail page URL (the trailing space in class="name " matches the site's markup)
                url = li.xpath('.//a[@class="name "]/@href')[0]
                full_url = full_href + url.replace('/loupan/', '')
                liebiaoye = {
                    'pic': img,
                    'home_name': name,
                    'shangquan': shangquan1,
                    'address': address,
                    'huxing': huxing,
                    'area': areas,
                    'price': price1,
                    'biaoqian': biaoqian,
                    'detail_url': full_url
                }
                liebiaoye_list = [liebiaoye]

                # detail pages -- development news (dongtai)
                detail_link = full_url + 'dongtai/'
                tree2 = request_url(detail_link)
                div_list = tree2.xpath('//div[@class="big-left fl"]/div[@class="dongtai-one for-dtpic"]')
                dongtai_list = []
                if not div_list:
                    print('no data')
                    continue
                for div in div_list:
                    dongtai_list.append({
                        'title': pankong(div.xpath('.//span[@class="a-title"]/text()')),
                        'content': pankong(div.xpath('.//div[@class="a-word"]/a/text()')),
                        'date': pankong(div.xpath('.//span[@class="a-time"]/text()'))
                    })

                # floor plans (huxingtu)
                huxing_link = full_url + 'huxingtu/'
                huxing_tree = request_url(huxing_link)
                huxing_li_list = huxing_tree.xpath('//ul[@class="item-list clear"]/li')
                huxing_list = []  # initialise before the loop so every floor plan is kept
                for huxing_li in huxing_li_list:
                    # floor plan image
                    pic = pankong(huxing_li.xpath('./a[@class="thumb"]/@href'))
                    full_pic = 'https://bd.fang.lianjia.com' + pic
                    # room counts; the text looks like '户型 4室2厅3卫', so the fixed
                    # indices [3], [5], [7] below pick out the three digits
                    huxing_room = pankong(huxing_li.xpath('.//ul/li[position()=1]/text()'))
                    # area
                    huxing_areas = pankong(huxing_li.xpath('.//ul/li[position()>1]/text()'))
                    # price
                    huxing_price = pankong(huxing_li.xpath('.//span[@class="price"]/text()'))
                    # date
                    huxing_date = pankong(huxing_li.xpath('.//span[@class="p2-time"]/text()'))
                    # floor plan description
                    huxing_type = pankong(huxing_li.xpath('.//a[@class="thumb"]/span/text()'))
                    huxing_list.append({
                        'pic': full_pic,
                        'room': huxing_room[3],
                        'ting': huxing_room[5],
                        'wei': huxing_room[7],
                        'area': huxing_areas,
                        'price': huxing_price,
                        'date': huxing_date,
                        'type_desc': huxing_type
                    })

                # photo album (xiangce): the first tab group holds renderings
                # (xiaoguo), the second holds real-scene photos (shijing)
                xiangce_link = full_url + 'xiangce/'
                print(xiangce_link)
                xiaoguo_tree = request_url(xiangce_link)
                xiaoguotu_div_list = xiaoguo_tree.xpath('.//div[@class="all-list"]/div[@class="tab-group"][position()<3]')
                xiangce_list = []
                pic = {'xiaoguo': [], 'shijing': []}
                if not xiaoguotu_div_list:
                    continue
                for xiaoguotu_div in xiaoguotu_div_list[0]:
                    # rendering image URL
                    pic['xiaoguo'].append(pankong(xiaoguotu_div.xpath('.//img/@src')))
                for shijing_div in xiaoguotu_div_list[1]:
                    # real-scene image URL
                    pic['shijing'].append(pankong(shijing_div.xpath('.//img/@src')))
                xiangce_list.append(pic)

                # development details: label / value pairs on the main detail page
                xiangqing_link = full_url
                print(xiangqing_link)
                xiangqing_tree = request_url(xiangqing_link)
                p_list1 = xiangqing_tree.xpath('.//span[@class="label"]/text()')
                p_list2 = xiangqing_tree.xpath('.//span[@class="label-val"]/text()')
                lp_list = []
                for label, value in zip(p_list1, p_list2):
                    lp_list.append({label: value.strip()})

                loupan_dic = {
                    '列表页': liebiaoye_list,
                    '楼盘动态': dongtai_list,
                    '户型介绍': huxing_list,
                    '楼盘相册': xiangce_list,
                    '楼盘详情': lp_list
                }
                print(loupan_dic)
                with open('lianjialp.txt', 'a', encoding='utf-8') as fp:
                    fp.write(str(loupan_dic) + '\n')
                time.sleep(1)  # brief pause between listings, to be gentle on the server
            except Exception as e:
                # one bad listing should not abort the whole crawl
                print('failed to parse listing:', e)
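A note on the pankong helper: nearly every field above is optional in the page markup, and li.xpath(...) returns a plain list, so indexing [0] on a missing field would raise IndexError and throw away the whole listing. pankong degrades a missing field to an empty string instead. A self-contained illustration of the pattern (the HTML snippet here is invented for the demo):

from lxml import etree

# pankong returns the first XPath hit, or '' when the node is absent,
# so a missing field becomes an empty string instead of an exception
def pankong(temp_list):
    return temp_list[0] if temp_list else ''

snippet = etree.HTML('<li><div class="resblock-name"><a>某某花园</a></div></li>')
name = pankong(snippet.xpath('//div[@class="resblock-name"]/a/text()'))
missing = pankong(snippet.xpath('//div[@class="resblock-area"]/span/text()'))
print(repr(name), repr(missing))  # prints '某某花园' ''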
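Note also that request_url as written has no timeout and no retry, so a single dropped connection can hang the crawl or kill a whole city midway. Below is a minimal hardened sketch, not a guaranteed drop-in: the retry count, backoff, and timeout values are arbitrary choices, and callers would additionally need to treat a None return as "page unavailable".

import time
import requests
from lxml import etree


def request_url(url, retries=3, timeout=10):
    # same interface as request_url above, plus a request timeout, an HTTP
    # status check, and simple exponential-backoff retries
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return etree.HTML(resp.content.decode('utf-8'))
        except requests.RequestException as e:
            print('request failed (%d/%d): %s' % (attempt + 1, retries, e))
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None  # callers should skip the page when this happens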
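Each development is appended to lianjialp.txt as str(loupan_dic), one Python dict literal per line. That is not JSON (single quotes, Python escapes), so json.loads will not parse it, but ast.literal_eval from the standard library will. A short sketch for loading the file back, assuming the exact output format produced above:

import ast

records = []
with open('lianjialp.txt', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        if line:
            # each line is the repr of one loupan_dic; literal_eval parses
            # Python literals without executing arbitrary code
            records.append(ast.literal_eval(line))

print('%d developments loaded' % len(records))
if records:
    print(records[0]['列表页'][0]['home_name'])

Writing json.dumps(loupan_dic, ensure_ascii=False) instead of str(loupan_dic) would make the file readable from other languages as well.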