# app.py
import requests
from bs4 import BeautifulSoup
import re
from googlesearch import search
import os
import sys

# Function to validate and extract emails from mailto links
def extract_emails_from_href(soup):
    emails = set()
    # Find all <a> tags with href starting with mailto:
    for a in soup.find_all('a', href=re.compile(r'^mailto:')):
        email = a['href'].replace('mailto:', '').strip()
        if email:
            emails.add(email)
    return emails
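
# Sketch of expected behavior (illustrative markup, not from a real page):
#   soup = BeautifulSoup('<a href="mailto:info@example.com">Email us</a>', 'html.parser')
#   extract_emails_from_href(soup)  # -> {'info@example.com'}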

# Function to scrape emails from the entire page content
def extract_emails_from_page(page_text):
    # Regex pattern to match typical email addresses
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # page_text is the raw HTML string, so the regex runs over the full markup
    return set(re.findall(email_pattern, page_text))  # Return as a set to ensure unique results
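
# Sketch of expected behavior (illustrative string):
#   extract_emails_from_page('<p>Write to sales@example.com or sales@example.com</p>')
#   # -> {'sales@example.com'}  (duplicates collapse because a set is returned)
# Note: running the regex over raw HTML can also match lookalikes such as
# asset names like 'logo@2x.png'.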

# Function to validate and extract phone numbers from tel links
def extract_phone_numbers_from_href(soup):
    phone_numbers = set()
    # Find all <a> tags with href starting with tel:
    for a in soup.find_all('a', href=re.compile(r'^tel:')):
        phone_number = a['href'].replace('tel:', '').strip()
        if phone_number:
            phone_numbers.add(phone_number)
    return phone_numbers
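
# Sketch of expected behavior (illustrative markup):
#   soup = BeautifulSoup('<a href="tel:+34911223344">Call us</a>', 'html.parser')
#   extract_phone_numbers_from_href(soup)  # -> {'+34911223344'}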

# Function to scrape phone numbers from the entire page content
def extract_phone_numbers_from_page(soup):
    # Define a set of keywords to look for in class values, ids, or attribute names
    keywords = ['phone', 'number', 'telefono', 'numero']
    # Find elements whose class values, id, or attribute names contain any keyword as a substring
    phone_elements = soup.find_all(lambda tag: any(
        any(keyword in (attr or '') for attr in (tag.get('class') or []) + [(tag.get('id') or '')] + list(tag.attrs.keys()))
        for keyword in keywords
    ))
    # Additionally, search inside header and footer elements
    header_elements = soup.find_all('header')
    footer_elements = soup.find_all('footer')
    # Combine all the elements (phone, header, and footer)
    relevant_elements = phone_elements + header_elements + footer_elements
    # Initialize an empty set to store found phone numbers
    phone_numbers = set()
    # Regex pattern to match phone numbers (e.g., international and local formats)
    phone_pattern = r'\+?\d[\d\s\-()]{7,15}'
    # Extract phone numbers from the text of each selected element
    for element in relevant_elements:
        text_content = element.get_text(strip=True)
        numbers_found = re.findall(phone_pattern, text_content)
        phone_numbers.update(numbers_found)
    return phone_numbers
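
# Sketch of expected behavior (illustrative markup): a <footer> containing the text
# "Tel: +34 911 22 33 44" matches the pattern above, so
# extract_phone_numbers_from_page(soup) would return {'+34 911 22 33 44'}.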

# Main function to scrape contact info from a URL
def scrape_contact_info(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'es-ES,es;q=0.5',
        'Connection': 'keep-alive'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
        # Parse the response as HTML
        soup = BeautifulSoup(response.content, 'html.parser')  # Use response.content so the parser handles the encoding
        page_content = response.text  # The full HTML content as a string
        # First try the explicit mailto: and tel: links
        emails = extract_emails_from_href(soup)
        phone_numbers = extract_phone_numbers_from_href(soup)
        # If no emails were found, scrape the entire page for emails
        if not emails:
            emails = extract_emails_from_page(page_content)
        # If no phone numbers were found, scrape likely elements for phone numbers
        if not phone_numbers:
            phone_numbers = extract_phone_numbers_from_page(soup)
        return emails, phone_numbers
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return set(), set()
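
# Sketch of expected usage (the URL is a placeholder):
#   emails, phones = scrape_contact_info('https://example.com/contacto')
#   # -> (set of email strings, set of phone strings); both empty if the request fails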

# Function to create the output directory if it doesn't exist
def create_output_directory(directory='output'):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Function to sanitize a query for use in a filename
def sanitize_filename(query):
    # Keep only alphanumerics, underscores, whitespace, and hyphens
    sanitized = re.sub(r'[^\w\s-]', '', query)  # Remove special characters
    sanitized = sanitized.replace(' ', '_')  # Replace spaces with underscores
    return sanitized
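
# Sketch of expected behavior (illustrative query):
#   sanitize_filename('restaurantes en Madrid!')  # -> 'restaurantes_en_Madrid'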

# Function to save the results to a text file, avoiding name collisions
def save_to_file(results, filename):
    output_directory = 'output'
    create_output_directory(output_directory)  # Ensure the output directory exists
    # Create the full file path
    filepath = os.path.join(output_directory, filename)
    # Check for existing files and add a numeric suffix if necessary
    base_filename, file_extension = os.path.splitext(filepath)
    counter = 1
    while os.path.exists(filepath):
        filepath = f"{base_filename}_{counter}{file_extension}"
        counter += 1
    # Only write the file if there are valid entries
    if results:  # Check that results is not empty
        with open(filepath, 'w') as f:  # 'w' mode; the loop above guarantees a fresh path
            # Sets to track unique emails and phone numbers across all results
            all_emails = set()
            all_phones = set()
            for entry in results:
                # Only save emails and phone numbers not seen in earlier entries
                unique_emails = set(entry['emails']) - all_emails
                unique_phones = set(entry['phones']) - all_phones
                # If there are no new unique emails or phones, skip this entry
                if not unique_emails and not unique_phones:
                    continue
                # Update the global sets with the new unique entries
                all_emails.update(unique_emails)
                all_phones.update(unique_phones)
                f.write(f"Website Name: {entry['name']}\n")
                f.write(f"URL: {entry['url']}\n")
                f.write(f"Emails: {', '.join(unique_emails) if unique_emails else 'None'}\n")
                f.write(f"Phone Numbers: {', '.join(unique_phones) if unique_phones else 'None'}\n")
                f.write("\n" + "-" * 50 + "\n\n")
    else:
        print(f"No valid entries found. The file '{filename}' will not be created.")

# Main script
def main():
    # Check that the search query is provided as a command-line argument
    if len(sys.argv) < 2:
        print("Usage: python app.py \"search query\"")
        sys.exit(1)  # Exit the program with an error code
    # Get the search query from the command-line argument
    query = sys.argv[1].strip()
    # Sanitize the query to use it as part of the filename
    sanitized_query = sanitize_filename(query)
    filename = f"{sanitized_query}.txt"
    num_results = 30
    results = []
    # Perform a Google search to get URLs
    search_results = search(query, num_results=num_results, lang="es")
    for url in search_results:
        print(f"Scraping {url}")
        emails, phone_numbers = scrape_contact_info(url)
        # Only add entries that have at least one email or phone number
        if emails or phone_numbers:
            results.append({
                'name': url.split('//')[-1].split('/')[0],  # Extract the domain name
                'url': url,
                'emails': emails,
                'phones': phone_numbers
            })
    # Save the results to a file using the sanitized filename
    save_to_file(results, filename)
    # Additional message if no entries were valid
    if not results:
        print("No valid entries were found. The output file was not created.")


if __name__ == "__main__":
    main()
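
# Illustrative invocation (the query is a made-up example):
#   python app.py "clinicas dentales Valencia"
# This searches Google for the query, scrapes each result for contact details, and
# writes them to output/clinicas_dentales_Valencia.txt (or a _1/_2 variant if it exists).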