-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcrawlerTool.py
30 lines (26 loc) · 1.1 KB
/
crawlerTool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Load every line of data.txt into memory; line endings are kept as-is.
with open('data.txt', mode='r', encoding='utf-8') as file:
    data = list(file)
def get_content(url, timeout=10):
    """Fetch a page and return the text of its description ``<meta>`` tag.

    Sends an HTTP GET request to ``url``, parses the response body as HTML
    and looks up the first ``<meta name="description">`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        The ``content`` attribute of the description meta tag, or an empty
        string when the tag or its ``content`` attribute is missing.

    Raises:
        Any exception raised by ``requests.get`` (for example on a timeout
        or connection failure) is propagated unchanged.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    if meta_tag is None:
        return ''
    # A description tag without a content attribute makes .get() return
    # None; normalise that to '' so callers always receive a string.
    return meta_tag.get('content') or ''
# Create the output text file and write one record per input line.
with open('output.txt', 'w', encoding='utf-8') as txtfile:
    # data[1:] skips the first line of the input. Blank lines are dropped
    # up front so they neither abort the run nor leave gaps in the serial
    # numbers.
    records = [line.strip() for line in data[1:] if line.strip()]
    for i, line in enumerate(tqdm(records, desc='提取进度'), 1):
        # Split on the first ': ' only, so a title that itself contains
        # ': ' stays intact instead of breaking the two-way unpacking.
        # A line with no ': ' at all still raises ValueError.
        article_id, title = line.split(': ', 1)
        url = f'https://www.xiaohongshu.com/explore/{article_id}'
        content = get_content(url)
        txtfile.write(f'序号: {i}\n')
        txtfile.write(f'链接: {url}\n')
        txtfile.write(f'标题: {title}\n')
        txtfile.write(f'内容: {content}\n\n')

# Report success only after the with-block has closed output.txt.
print("提取完成并已保存到output.txt文件中。")