main.py
import json
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def get_all_links(url):
    """Fetch a page and return the raw href values of all its <a> tags."""
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a.get('href') for a in soup.find_all('a', href=True)]
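
# Example (illustrative, not from the source): for a page containing
# <a href="/about"> and <a href="https://example.com/contact">,
# get_all_links() returns ['/about', 'https://example.com/contact'];
# relative entries are resolved by urljoin() in crawl_website() below.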


def crawl_website(start_url, max_depth=3):
    """Breadth-first crawl from start_url up to max_depth link hops."""
    visited_urls = set()
    queue = [(start_url, 0)]  # FIFO of (url, depth) pairs
    while queue:
        current_url, depth = queue.pop(0)
        if current_url in visited_urls or depth > max_depth:
            continue
        print(f"crawling: {current_url}")
        visited_urls.add(current_url)
        # Catch per-page errors so one bad URL does not abort the whole crawl.
        try:
            links = get_all_links(current_url)
        except Exception as e:
            print(f"Error crawling {current_url}: {e}")
            continue
        for link in links:
            absolute_url = urljoin(current_url, link)
            # Skip mailto:, javascript:, and other non-web schemes.
            if urlparse(absolute_url).scheme in ('http', 'https'):
                queue.append((absolute_url, depth + 1))
    return visited_urls
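
# Note (not in the original script): the loop above fetches pages as fast as
# it can. A polite crawler would usually pause between requests; a minimal
# sketch, assuming a fixed one-second delay is acceptable for the target site:
#
#     import time
#     time.sleep(1)  # call just before get_all_links() inside the loop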


if __name__ == "__main__":
    start_url = "https://ronalds.ggradio.pro"
    max_depth = 3
    crawled_urls = crawl_website(start_url, max_depth)
    try:
        with open("urls.json", "w") as json_file:
            json.dump(sorted(crawled_urls), json_file, indent=2)
        print("Crawled URLs saved to urls.json")
    except OSError as e:
        print(f"Error saving crawled URLs to urls.json: {e}")