Python 爬取链家新房数据

清疚 2022-05-09 02:38 220阅读 0赞

转载：https://blog.csdn.net/clyjjczwdd/article/details/79466032
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import pandas as pd
    
    # Request headers sent with every page fetch. They imitate a desktop
    # Internet Explorer 10 browser that arrived from a Baidu search result.
    # NOTE(review): the Referer contains literal "&amp;" sequences, which look
    # like an HTML-escaping artifact from copy/paste -- confirm before changing.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; "
            "Trident/6.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; "
            ".NET CLR 3.0.30729; InfoPath.3; .NET4.0C; .NET4.0E)"
        ),
        "Accept": "image/webp,image/*,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Referer": (
            "http://www.baidu.com/link"
            "?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s"
            "&amp;wd=&amp;eqid=c3435a7d00006bd600000003582bfd1f"
        ),
        "Connection": "keep-alive",
    }
    # Path segment that precedes the page number in listing URLs (".../pg3/").
    page = "pg"
    
    
    def generate_cityurl(user_in_city):
        """Build the root URL of a city's new-home ("loupan") listing.

        Args:
            user_in_city: The city's subdomain on lianjia.com as typed by the
                user. Leading/trailing whitespace is ignored so that stray
                spaces from interactive input do not end up in the hostname.

        Returns:
            The string ``'https://<city>.lianjia.com/loupan/'``; for example
            ``generate_cityurl('bj')`` gives ``'https://bj.lianjia.com/loupan/'``.
        """
        return 'https://' + user_in_city.strip() + '.lianjia.com/loupan/'
    
    
    def areainfo(url, max_page=38, delay=0.5, timeout=10):
        """Download consecutive listing pages and return their bodies joined.

        Requests ``<url>pg1/`` through ``<url>pg<max_page>/`` in order, sending
        the module-level ``headers`` with each request, and prints every URL
        as it is fetched.

        Args:
            url: Listing root URL ending in ``/`` (as returned by
                ``generate_cityurl``).
            max_page: Number of the last page to fetch (inclusive). Defaults
                to 38, the page count the script has always fetched.
            delay: Seconds to pause between two consecutive requests.
            timeout: Per-request timeout in seconds passed to ``requests.get``
                so a stalled connection cannot hang the script forever.

        Returns:
            ``bytes``: the raw response bodies of all pages concatenated in
            page order (``b''`` when ``max_page`` is less than 1).

        Raises:
            requests.exceptions.RequestException: if a request fails or
                times out.
        """
        bodies = []
        for page_no in range(1, max_page + 1):
            page_url = '{}pg{}/'.format(url, page_no)
            print(page_url)
            response = requests.get(url=page_url, headers=headers, timeout=timeout)
            bodies.append(response.content)
            # Throttle between requests (not once after the whole loop) so the
            # site is not hit with a burst of back-to-back fetches.
            if page_no < max_page:
                time.sleep(delay)
        # Join once at the end instead of re-copying the buffer on every page.
        return b''.join(bodies)
    
    
    # Module-level accumulator: one dict per scraped listing, filled by listinfo().
    hlist = list()
    
    
    def _find_by_class(parent, tag, css_class):
        """Return the first ``tag`` with class ``css_class`` under ``parent``, or None."""
        if parent is None:
            return None
        return parent.find(tag, attrs={"class": css_class})


    def _text_or_default(node, default="暂无"):
        """Return ``node.get_text()``, or ``default`` when ``node`` is None."""
        return default if node is None else node.get_text()


    def listinfo(listhtml):
        """Parse listing HTML and append one record per listing to ``hlist``.

        Every ``div.resblock-desc-wrapper`` element in ``listhtml`` becomes a
        dict with the keys ``title``, ``wuye``, ``xiaoshouzhuangtai``,
        ``location``, ``jishi``, ``area``, ``tag``, ``price`` and
        ``totalprice``. Newlines are removed from ``location`` and ``jishi``.

        Any element that is absent from a listing yields the placeholder
        "暂无" for that field instead of raising, so one incomplete card does
        not abort the whole run.

        Args:
            listhtml: HTML markup (str or bytes) of one or more listing pages.

        Returns:
            None. Results are appended to the module-level ``hlist``.
        """
        areasoup = BeautifulSoup(listhtml, 'html.parser')
        cards = areasoup.find_all('div', attrs={'class': 'resblock-desc-wrapper'})

        for house in cards:
            title_div = _find_by_class(house, "div", "resblock-name")
            if title_div is None:
                name_link, title_tags = None, []
            else:
                name_link = title_div.a
                title_tags = title_div.find_all("span")
            # The first span holds the property type, the second the sale status.
            wuye_node = title_tags[0] if len(title_tags) > 0 else None
            status_node = title_tags[1] if len(title_tags) > 1 else None

            price_div = _find_by_class(house, "div", "resblock-price")

            location = _text_or_default(_find_by_class(house, "div", "resblock-location"))
            jishi = _text_or_default(_find_by_class(house, "a", "resblock-room"))

            hlist.append({
                'title': _text_or_default(name_link),
                'wuye': _text_or_default(wuye_node),
                'xiaoshouzhuangtai': _text_or_default(status_node),
                'location': location.replace("\n", ""),
                'jishi': jishi.replace("\n", ""),
                'area': _text_or_default(_find_by_class(house, "div", "resblock-area")),
                'tag': _text_or_default(_find_by_class(house, "div", "resblock-tag")),
                'price': _text_or_default(_find_by_class(price_div, "div", "main-price")),
                'totalprice': _text_or_default(_find_by_class(price_div, "div", "second")),
            })
    
    
    if __name__ == '__main__':
        # Script entry point: ask for a city, scrape its listing pages and
        # save the parsed records as a CSV file.
        import os

        user_in_city = input('输入抓取城市：')
        url = generate_cityurl(user_in_city)
        print(url)

        # The first record carries the Chinese column labels, so the CSV ends
        # up with the English key header followed by this label row.
        hlist.append(
            {'title': "楼盘名称", 'wuye': "物业类型", 'xiaoshouzhuangtai': "销售状态", 'location': "位置",
             'jishi': "房型", 'area': "面积", 'tag': "标签", 'price': "单价",
             'totalprice': "总价"})

        areahtml = areainfo(url)
        listinfo(areahtml)

        houseinfo = pd.DataFrame(hlist,
                                 columns=['title', 'wuye', 'xiaoshouzhuangtai', 'location',
                                          'jishi', 'area', 'tag', 'price',
                                          'totalprice'])

        # Save to the current user's Desktop rather than a path hard-coded to
        # one machine; fall back to the working directory when there is no
        # Desktop folder, so the scraped data is not lost at the last step.
        desktop_dir = os.path.join(os.path.expanduser('~'), 'Desktop')
        output_dir = desktop_dir if os.path.isdir(desktop_dir) else os.getcwd()
        output_path = os.path.join(output_dir, '链家新房.csv')
        # utf_8_sig writes a BOM so that Excel detects the encoding.
        houseinfo.to_csv(output_path, index=False, encoding="utf_8_sig")
        print(output_path)

--------------------- 本文来自 瞬间的未来式 的 CSDN 博客，全文地址请点击：https://blog.csdn.net/clyjjczwdd/article/details/79466032?utm_source=copy