external_link_checker.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import csv
import argparse
import urllib3
import os
from dotenv import load_dotenv
from google.cloud import webrisk_v1

# Load environment variables from the .env file
load_dotenv()

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Set of visited URLs to avoid processing the same URL multiple times
visited_urls = set()

# Map each external link to its source page and safety status
external_links = {}

# Proxy settings for requests; leave empty to connect directly
proxies_dic = {
    # "http": "http://your_proxy:port",
    # "https": "http://your_proxy:port"
}

# Get the Google API key from environment variables
google_api_key = os.getenv('GOOGLE_API_KEY')
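
# Example .env entry (placeholder value; the key name matches the lookup above):
#   GOOGLE_API_KEY=your-api-key-here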


def is_external_link(url, base_url):
    """
    Check if a URL is an external link.

    :param url: URL to check
    :param base_url: The base URL of the website
    :return: True if the URL is external, False otherwise
    """
    return urlparse(url).netloc != urlparse(base_url).netloc
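
# Note: the netloc comparison above is exact, so subdomains such as
# blog.example.com are treated as external to example.com.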


def check_url_safety(url):
    """
    Check the safety of a URL using the Google Web Risk API.

    :param url: URL to check
    :return: 'Safe' if no threat is found, 'Unsafe' if the URL is risky,
             'Unknown' if the lookup fails, 'Not Checked' if no API key is set
    """
    if google_api_key:
        client = webrisk_v1.WebRiskServiceClient(
            client_options={"api_key": google_api_key}
        )
        threat_types = [webrisk_v1.ThreatType.MALWARE,
                        webrisk_v1.ThreatType.SOCIAL_ENGINEERING,
                        webrisk_v1.ThreatType.UNWANTED_SOFTWARE]
        try:
            response = client.search_uris(uri=url, threat_types=threat_types)
            if response.threat:
                return 'Unsafe'
            return 'Safe'
        except Exception as e:
            print(f"Error checking URL safety {url}: {e}")
            return 'Unknown'
    else:
        # Without an API key, skip the lookup instead of guessing
        return 'Not Checked'
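
# Note: check_url_safety builds a new Web Risk client on every call; for large
# crawls the client could be created once at module level and reused.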


def scrape_links(url, base_url):
    """
    Recursively scrape links from the given URL.

    :param url: The current URL to scrape
    :param base_url: The base URL of the website
    """
    try:
        # Skip already visited URLs
        if url in visited_urls:
            return
        visited_urls.add(url)  # Mark the URL as visited
        # verify=False matches the warning suppression above; the timeout keeps
        # one unresponsive host from stalling the whole crawl
        response = requests.get(url, proxies=proxies_dic, verify=False, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            link = urljoin(base_url, a_tag['href'])
            if link not in visited_urls:
                if is_external_link(link, base_url):
                    # Safety checks only apply to HTTP and HTTPS URLs
                    parsed_link = urlparse(link)
                    if parsed_link.scheme in ['http', 'https']:
                        safety_status = check_url_safety(link)
                    else:
                        safety_status = 'Not Applicable'
                    # Store the external link, its source page, and safety status
                    external_links[link] = (url, safety_status)
                else:
                    scrape_links(link, base_url)  # Recurse into internal links only
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")


def save_to_csv(external_links, output_file):
    """
    Save the collected external links, their source pages, and safety status
    to a CSV file.

    :param external_links: Dictionary of external links, their source pages,
                           and safety status
    :param output_file: The output CSV file path
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['External Link', 'Source Page', 'Safety Status'])
        for link, (source, safety_status) in external_links.items():
            csvwriter.writerow([link, source, safety_status])


def main(base_url, output_file):
    """
    Main function to start the scraping process and save results to a CSV file.

    :param base_url: The base URL of the website to scrape
    :param output_file: The output CSV file path
    """
    scrape_links(base_url, base_url)
    save_to_csv(external_links, output_file)
    print(f"Saved {len(external_links)} external links to {output_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape external links from a website.")
    parser.add_argument("base_url", help="The base URL of the website to scrape")
    parser.add_argument("output_file", help="The output CSV file")
    args = parser.parse_args()
    main(args.base_url, args.output_file)
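
# Example invocation (hypothetical URL and output path):
#   python external_link_checker.py https://example.com external_links.csv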