import requests
import threading
import pandas as pd
import time
from lxml import etree
from fake_useragent import UserAgent
from random import choice
from queue import Queue
# Proxy IP pool: hide or rotate the crawler's real IP address
proxies = [
    ('http', 'http://120.55.14.64:80'),
    ('http', 'http://47.96.37.159:80'),
    ('http', 'http://60.174.0.184:8089'),
    ('http', 'http://182.204.178.124:8089')
]

# Random User-Agent generator
ua = UserAgent()

# Build the listing URLs for pages 1-50
def url_creat():
    url = 'https://nj.lianjia.com/ershoufang/pg{}/'
    # List of the first 50 page URLs
    links = [url.format(i) for i in range(1, 51)]
    return links
# Fetch one listing page and extract its data
def url_parse(url, result_queue):
    try:
        headers = {
            # Only the encodings requests can always decode on its own
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'lianjia_uuid=1482b2e7-3112-4886-a41e-9d2d2e86b44d; _jzqc=1; _jzqckmp=1; sajssdk_2015_cross_new_user=1; _ga=GA1.2.452213069.1724131249; _gid=GA1.2.1832912406.1724131249; _ga_654P0WDKYN=GS1.2.1724131249.1.0.1724131249.0.0.0; hip=apJG191KDRPikxgtTyi3OmoLYtJFJa9yCiORXQVgZ51QYD0SMeG3eATx2a1EuGGf7_iZ0YxmbRitHPS-wygw6eU7_uOz42xUXwELiqydaMqCOjZqUVrz7n77aek0s8UJGOuDAVObHUnwVJIEA9dtMtsaU2qLrjx0DsDaDb26BNk6uSjJ2owluyHyCg%3D%3D; _jzqx=1.1724131236.1724138261.2.jzqsr=cn%2Ebing%2Ecom|jzqct=/.jzqsr=hip%2Elianjia%2Ecom|jzqct=/; _ga_DX18CJBZRT=GS1.2.1724138272.1.0.1724138272.0.0.0; _ga_EYZV9X59TQ=GS1.2.1724138272.1.0.1724138272.0.0.0; select_city=320100; lianjia_ssid=aed34a25-3548-4cb0-9828-61f1b13e8284; Qs_lvt_200116=1724138352; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1724138353; HMACCOUNT=67BCC534482CC88A; jzqa=1.3305053847575342600.1724131236.1724138261.1724140333.3; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221916e3adab62f-06c1cb6122e849-4c657b58-1024000-1916e3adab777c%22%2C%22%24device_id%22%3A%221916e3adab62f-06c1cb6122e849-4c657b58-1024000-1916e3adab777c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; Qs_pv_200116=2599661905406085600%2C889815816991505000%2C2899766764332378600%2C91402511096122320%2C1000057718938937700; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1724140491; _jzqb=1.5.10.1724140333.1; _ga_E91JCCJY3Z=GS1.2.1724140344.3.1.1724140502.0.0.0; _ga_MFYNHLJT0H=GS1.2.1724140344.3.1.1724140502.0.0.0',
            # Host must match the domain actually being requested
            'Host': 'nj.lianjia.com',
            'Pragma': 'no-cache',
            'Referer': 'https://nj.lianjia.com/',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?1',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'same-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ua.random
        }
        # Pick a random proxy for this request
        proxy = choice(proxies)
        response = requests.get(url=url, headers=headers, proxies={proxy[0]: proxy[1]})
        response.raise_for_status()  # Raise if the request failed
        tree = etree.HTML(response.text)
        # All <li> tags under the listing <ul>; XPath attribute names are
        # case-sensitive, so it must be 'class', not 'Class'
        li_List = tree.xpath("//*[@class='sellListContent']/li")
        results = []
        for li in li_List:
            try:
                # Title
                title = li.xpath('./div/div/a/text()')[0]
                # Link
                link = li.xpath('./div/div/a/@href')[0]
                # Location
                postion = li.xpath('./div/div[2]/div/a/text()')[0] + li.xpath('./div/div[2]/div/a[2]/text()')[0]
                # Layout type
                types = li.xpath('./div/div[3]/div/text()')[0].split(' | ')[0]
                # Floor area
                area = li.xpath('./div/div[3]/div/text()')[0].split(' | ')[1]
                # Remaining house details
                info = li.xpath('./div/div[3]/div/text()')[0].split(' | ')[2:-1]
                info = ''.join(info)
                # Total price
                count_price = li.xpath('.//div/div[6]/div/span/text()')[0] + '万'
                # Unit price
                angle_price = li.xpath('.//div/div[6]/div[2]/span/text()')[0]
                dic = {'标题': title, '位置': postion, '房屋类型': types, '面积': area,
                       '单价': angle_price, '总价': count_price, '介绍': info, '网址': link}
                print(dic)
                # Add this listing to the page's result list
                results.append(dic)
            except IndexError:
                print(f"Error parsing data from {url}")
                continue
        # Hand this page's results (a flat list of dicts) to the main thread
        result_queue.put(results)
    except requests.RequestException as e:
        print(f"Request error for {url}: {e}")
def run():
    links = url_creat()
    threads = []
    result_queue = Queue()
    all_results = []
    # One crawler thread per page
    for i in links:
        x = threading.Thread(target=url_parse, args=(i, result_queue))
        x.start()
        threads.append(x)
        # Space the requests out a little
        time.sleep(1)
    for t in threads:
        t.join()
    # Collect every page's results
    while not result_queue.empty():
        page_results = result_queue.get()  # a flat list of dicts for one page
        all_results.extend(page_results)
    # Export all listings to Excel
    data = pd.DataFrame(all_results)
    data.to_excel('房屋信息822.xlsx', index=False)
if __name__ == '__main__':
    run()
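Free proxies like the four hardcoded above tend to die quickly. Here is a minimal sketch for filtering the pool before crawling (the check_proxies helper and its test URL are illustrative assumptions, not part of the original script):

import requests

def check_proxies(pool, test_url='https://nj.lianjia.com/', timeout=5):
    # Keep only the (scheme, address) pairs that can fetch test_url in time
    alive = []
    for scheme, address in pool:
        try:
            r = requests.get(test_url, proxies={scheme: address}, timeout=timeout)
            r.raise_for_status()
            alive.append((scheme, address))
        except requests.RequestException:
            # Dead, slow, or blocked proxy: drop it
            pass
    return alive

Calling proxies = check_proxies(proxies) at the start of run() keeps choice(proxies) from repeatedly picking dead entries.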
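Spawning one thread per page (50 threads here) works, but a bounded pool is easier on both the local machine and the target site. A sketch of the same crawl using the standard library's concurrent.futures, assuming the url_creat and url_parse defined above (the max_workers value is an illustrative choice):

from concurrent.futures import ThreadPoolExecutor
from queue import Queue

def run_pooled(max_workers=8):
    links = url_creat()
    result_queue = Queue()
    all_results = []
    # At most max_workers pages are fetched concurrently
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for link in links:
            pool.submit(url_parse, link, result_queue)
    # Leaving the with-block waits for every submitted task to finish
    while not result_queue.empty():
        all_results.extend(result_queue.get())
    return all_results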