crawler.py
import time
import threading
import concurrent.futures
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from reppy.robots import Robots


class Crawler:
    """
    A web crawler that recursively crawls a website to build a site map.

    Attributes:
        target_url (str): The target website URL to crawl.
        visited_links (set): A set of URLs that have already been visited.
        site_map (set): A set of URLs in the crawled site map.
        robot_parser (Robots): A robots.txt parser object from reppy.robots.
        delay (int): The delay in seconds between requests to the target website.

    Methods:
        init_robot_parser(): Initialize the robots.txt parser using reppy.robots.
        can_fetch(url): Check whether the URL is allowed by robots.txt.
        crawl(url): Recursively crawl the website to build a site map.
        extract_links(url): Extract all internal links from a given URL.
        extract_forms(url): Extract all forms from a given URL.
    """
    def __init__(self, target_url, delay=1):
        self.target_url = target_url
        self.visited_links = set()
        self.site_map = set()
        self.robot_parser = self.init_robot_parser()
        self.delay = delay
        # Guards the check-and-add on visited_links across worker threads.
        self._visited_lock = threading.Lock()
    def init_robot_parser(self):
        """Initialize the robots.txt parser using reppy.robots."""
        # Use '/robots.txt' (with a leading slash) so the file is resolved
        # from the site root even when target_url includes a path.
        return Robots.fetch(urljoin(self.target_url, '/robots.txt'))
    def can_fetch(self, url):
        """Check whether the URL is allowed by robots.txt."""
        # reppy's Robots exposes allowed(url, agent), not the
        # urllib.robotparser-style can_fetch(agent, url).
        return self.robot_parser.allowed(url, '*')
    def crawl(self, url=None):
        """Recursively crawl the website to build a site map."""
        if url is None:
            url = self.target_url
        # Skip URLs that were already visited or are disallowed by robots.txt.
        # The lock prevents two threads from claiming the same URL at once.
        with self._visited_lock:
            if url in self.visited_links or not self.can_fetch(url):
                return
            self.visited_links.add(url)
        time.sleep(self.delay)  # Rate-limit requests to the target website
        # Extract internal links from the current URL
        page_links = self.extract_links(url)
        self.site_map.add(url)
        # Use a ThreadPoolExecutor to process multiple links concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(self.crawl, page_links)
    def extract_links(self, url):
        """Extract all internal links from a given URL."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # href=True skips anchors without an href attribute, which would
            # otherwise make urljoin fail on None.
            links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
            # Keep only links on the same host as the target.
            target_netloc = urlparse(self.target_url).netloc
            return [link for link in links if urlparse(link).netloc == target_netloc]
        except requests.exceptions.RequestException as e:
            print(f"Error occurred while fetching the URL {url}: {e}")
            return []
    def extract_forms(self, url):
        """Extract all forms from a given URL."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error occurred while fetching the URL {url}: {e}")
            return []
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.find_all('form')
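

# A minimal usage sketch. Assumptions: the crawler is run as a script, and
# 'https://example.com' is a placeholder for a real target that serves a
# robots.txt file.
if __name__ == '__main__':
    crawler = Crawler('https://example.com', delay=1)
    crawler.crawl()
    print(f"Discovered {len(crawler.site_map)} pages:")
    for page_url in sorted(crawler.site_map):
        print(page_url)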