Please take a look at a problem in my multithreaded scraping code #802

Open
QIANJUFEN opened this issue Sep 4, 2024 · 0 comments

```python
import requests
import threading
import pandas as pd
import time
from lxml import etree
from fake_useragent import UserAgent
from random import choice
from queue import Queue
```

A proxy IP pool, used to hide or rotate the crawler's real IP address.

```python
proxies = [
    ('http', 'http://120.55.14.64:80'),
    ('http', 'http://47.96.37.159:80'),
    ('http', 'http://60.174.0.184:8089'),
    ('http', 'http://182.204.178.124:8089')
]
```
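For reference (background on `requests`, not code from the issue): the `proxies` argument is a dict keyed by URL scheme, so a tuple pool like the one above has to be turned into a mapping per request. A minimal sketch of a helper (`random_proxy_map` is a hypothetical name) that routes both schemes through one randomly chosen proxy:

```python
from random import choice

def random_proxy_map(pool):
    # Pool entries look like ('http', 'http://ip:port').
    scheme, address = choice(pool)
    # requests matches these keys against the target URL's scheme,
    # so cover https as well or https:// pages will bypass the proxy.
    return {'http': address, 'https': address}

# Usage: requests.get(url, headers=headers, proxies=random_proxy_map(proxies))
```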

```python
# Generate a random User-Agent
ua = UserAgent()

# Build the listing-page URLs for pages 1-50
def url_creat():
    url = 'https://nj.lianjia.com/ershoufang/pg{}/'
    # List of the first 50 page URLs
    links = [url.format(i) for i in range(1, 51)]
    return links

# Fetch one page and extract its listings
def url_parse(url, result_queue):
    try:
        headers = {
            # No 'Host' or 'Accept-Encoding' here: requests fills both in itself.
            # Hard-coding Host to dig.lianjia.com (an analytics host, not
            # nj.lianjia.com) or advertising zstd/br encodings that requests
            # may not be able to decode can break the response.
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'lianjia_uuid=1482b2e7-3112-4886-a41e-9d2d2e86b44d; _jzqc=1; _jzqckmp=1; sajssdk_2015_cross_new_user=1; _ga=GA1.2.452213069.1724131249; _gid=GA1.2.1832912406.1724131249; _ga_654P0WDKYN=GS1.2.1724131249.1.0.1724131249.0.0.0; hip=apJG191KDRPikxgtTyi3OmoLYtJFJa9yCiORXQVgZ51QYD0SMeG3eATx2a1EuGGf7_iZ0YxmbRitHPS-wygw6eU7_uOz42xUXwELiqydaMqCOjZqUVrz7n77aek0s8UJGOuDAVObHUnwVJIEA9dtMtsaU2qLrjx0DsDaDb26BNk6uSjJ2owluyHyCg%3D%3D; _jzqx=1.1724131236.1724138261.2.jzqsr=cn%2Ebing%2Ecom|jzqct=/.jzqsr=hip%2Elianjia%2Ecom|jzqct=/; _ga_DX18CJBZRT=GS1.2.1724138272.1.0.1724138272.0.0.0; _ga_EYZV9X59TQ=GS1.2.1724138272.1.0.1724138272.0.0.0; select_city=320100; lianjia_ssid=aed34a25-3548-4cb0-9828-61f1b13e8284; Qs_lvt_200116=1724138352; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1724138353; HMACCOUNT=67BCC534482CC88A; jzqa=1.3305053847575342600.1724131236.1724138261.1724140333.3; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221916e3adab62f-06c1cb6122e849-4c657b58-1024000-1916e3adab777c%22%2C%22%24device_id%22%3A%221916e3adab62f-06c1cb6122e849-4c657b58-1024000-1916e3adab777c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; Qs_pv_200116=2599661905406085600%2C889815816991505000%2C2899766764332378600%2C91402511096122320%2C1000057718938937700; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1724140491; _jzqb=1.5.10.1724140333.1; _ga_E91JCCJY3Z=GS1.2.1724140344.3.1.1724140502.0.0.0; _ga_MFYNHLJT0H=GS1.2.1724140344.3.1.1724140502.0.0.0',
            'Pragma': 'no-cache',
            'Referer': 'https://nj.lianjia.com/',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?1',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'same-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ua.random
        }
        proxy = choice(proxies)
        # requests keys proxies by URL scheme; cover both so https:// pages are proxied too
        response = requests.get(url=url, headers=headers,
                                proxies={'http': proxy[1], 'https': proxy[1]})
        response.raise_for_status()  # Raise on HTTP errors
        tree = etree.HTML(response.text)
        # All li tags under the listing ul; XPath attribute names are
        # case-sensitive, so this must be 'class', not 'Class'
        li_list = tree.xpath("//*[@class='sellListContent']/li")
        results = []

        for li in li_list:
            try:
                # Title
                title = li.xpath('./div/div/a/text()')[0]
                # Link
                link = li.xpath('./div/div/a/@href')[0]
                # Location
                position = li.xpath('./div/div[2]/div/a/text()')[0] + li.xpath('./div/div[2]/div/a[2]/text()')[0]
                # Layout type
                types = li.xpath('./div/div[3]/div/text()')[0].split(' | ')[0]
                # Floor area
                area = li.xpath('./div/div[3]/div/text()')[0].split(' | ')[1]
                # Remaining house details
                info = li.xpath('./div/div[3]/div/text()')[0].split(' | ')[2:-1]
                info = ''.join(info)
                # Total price
                count_price = li.xpath('.//div/div[6]/div/span/text()')[0] + '万'
                # Unit price
                angle_price = li.xpath('.//div/div[6]/div[2]/span/text()')[0]
                dic = {'标题': title, '位置': position, '房屋类型': types, '面积': area,
                       '单价': angle_price, '总价': count_price, '介绍': info, '网址': link}
                print(dic)
                # Add this listing to the page's results
                results.append(dic)
            except IndexError:
                print(f"Error parsing data from {url}")
                continue
        # Push this page's list of listings onto the queue
        result_queue.put(results)
    except requests.RequestException as e:
        print(f"Request error for {url}: {e}")
```

```python
def run():
    links = url_creat()
    threads = []
    result_queue = Queue()
    lock = threading.Lock()  # Never used below; Queue is already thread-safe
    all_results = []
    # Scrape every page on its own thread
    for i in links:
        x = threading.Thread(target=url_parse, args=(i, result_queue))
        x.start()
        threads.append(x)
        # Space out the request starts
        time.sleep(1)
    for t in threads:
        t.join()

    # Collect the results: each queue item is one page's list of listing
    # dicts, so extend with the item itself (indexing [0] grabs a single dict)
    while not result_queue.empty():
        page_results = result_queue.get()
        all_results.extend(page_results)
    # Write every listing to an Excel file
    data = pd.DataFrame(all_results)
    data.to_excel('房屋信息822.xlsx', index=False)

if __name__ == '__main__':
    run()
```
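A common tidier alternative to this fan-out (not from the original issue): `concurrent.futures.ThreadPoolExecutor` caps the number of simultaneous requests (the loop above eventually has all 50 threads running at once) and collects return values without a hand-rolled queue. A minimal sketch, assuming a hypothetical `url_parse_list(url)` variant of `url_parse` that simply returns the page's list of dicts:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_with_pool(max_workers=5):
    all_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # url_parse_list is a hypothetical variant of url_parse that
        # returns its results list instead of pushing to a queue.
        futures = [pool.submit(url_parse_list, link) for link in url_creat()]
        for future in as_completed(futures):
            all_results.extend(future.result())
    return all_results
```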
