crawler.py
import time
import threading
import concurrent.futures
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from reppy.robots import Robots


class Crawler:
    """
    A web crawler that recursively crawls a website to build a site map.

    Attributes:
        target_url (str): The target website URL to crawl.
        visited_links (set): A set of URLs that have already been visited.
        site_map (set): A set of URLs in the crawled site map.
        robot_parser (Robots): A robots.txt parser object from reppy.robots.
        delay (int): The delay in seconds between requests to the target website.

    Methods:
        init_robot_parser(): Initialize the robots.txt parser using reppy.robots.
        can_fetch(url): Check whether the URL is allowed by robots.txt.
        crawl(url): Recursively crawl the website to build a site map.
        extract_links(url): Extract all internal links from a given URL.
        extract_forms(url): Extract all forms from a given URL.
    """
    def __init__(self, target_url, delay=1):
        self.target_url = target_url
        self.visited_links = set()
        self.site_map = set()
        self.robot_parser = self.init_robot_parser()
        self.delay = delay
        # Guards the check-and-add on visited_links across worker threads.
        self._visited_lock = threading.Lock()
    def init_robot_parser(self):
        """Initialize the robots.txt parser using reppy.robots."""
        # Use '/robots.txt' (with a leading slash) so the file is resolved
        # from the site root even when target_url includes a path.
        return Robots.fetch(urljoin(self.target_url, '/robots.txt'))
    def can_fetch(self, url):
        """Check whether the URL is allowed by robots.txt."""
        # reppy's Robots exposes allowed(url, agent), not the
        # urllib.robotparser-style can_fetch(agent, url).
        return self.robot_parser.allowed(url, '*')
    def crawl(self, url=None):
        """Recursively crawl the website to build a site map."""
        if url is None:
            url = self.target_url
        # Skip URLs that were already visited or are disallowed by robots.txt.
        # The lock prevents two threads from claiming the same URL at once.
        with self._visited_lock:
            if url in self.visited_links or not self.can_fetch(url):
                return
            self.visited_links.add(url)
        time.sleep(self.delay)  # Rate-limit requests to the target website
        # Extract internal links from the current URL
        page_links = self.extract_links(url)
        self.site_map.add(url)
        # Use a ThreadPoolExecutor to process multiple links concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(self.crawl, page_links)
    def extract_links(self, url):
        """Extract all internal links from a given URL."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # href=True skips anchors without an href attribute, which would
            # otherwise make urljoin fail on None.
            links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
            # Keep only links on the same host as the target.
            target_netloc = urlparse(self.target_url).netloc
            return [link for link in links if urlparse(link).netloc == target_netloc]
        except requests.exceptions.RequestException as e:
            print(f"Error occurred while fetching the URL {url}: {e}")
            return []
    def extract_forms(self, url):
        """Extract all forms from a given URL."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error occurred while fetching the URL {url}: {e}")
            return []
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.find_all('form')
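

# A minimal usage sketch. Assumptions: the crawler is run as a script, and
# 'https://example.com' is a placeholder for a real target that serves a
# robots.txt file.
if __name__ == '__main__':
    crawler = Crawler('https://example.com', delay=1)
    crawler.crawl()
    print(f"Discovered {len(crawler.site_map)} pages:")
    for page_url in sorted(crawler.site_map):
        print(page_url)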