"""
A web crawler CLI:
Given a URL string and a depth, the crawler scans the web page for images, follows every link inside that
page and scans it as well. Crawling stops once the desired depth has been reached (denoted by the input
parameter, where depth=0 is the first page).
The image source paths are saved to the 'results.json' file in the following format:
{
    "results": [
        {
            "imageUrl": string,
            "sourceUrl": string,
            "depth": number
        }
    ]
}
"""
import requests
import sys
import re
import os
import json
from bs4 import BeautifulSoup
from collections import deque
from PageNode import PageNode

# ---------- Global vars ---------- #
RESULTS_FILE_PATH = "results.json"
ROOT_URL_ARG_IDX = 0
DEPTH_ARG_IDX = 1

def main():
    argument_list = sys.argv[1:]
    if len(argument_list) < 2:
        print("Usage: python crawler.py <root_url> <depth>")
        sys.exit(1)
    root_url, max_depth = argument_list[ROOT_URL_ARG_IDX], int(argument_list[DEPTH_ARG_IDX])
    if not is_file_exists(RESULTS_FILE_PATH):
        init_results_file(RESULTS_FILE_PATH)
    crawl_page(root_url, max_depth)

def is_file_exists(file_name):
    """
    Test whether a file already exists in the file system and is not empty.
    :param file_name: the file path to test
    :return: True if it exists and has content, False otherwise.
    """
    return os.path.isfile(file_name) and os.path.getsize(file_name) > 0

def init_results_file(file_name):
    """
    Initialize the local results file with an empty results list.
    :param file_name: String - The local file path
    :return: None
    """
    with open(file_name, 'w') as file:
        json.dump({"results": []}, file, indent=4)

# ---------- The crawler function -------------- #
def crawl_page(root_url, max_crawling_depth):
    """
    The web crawler's main function, based on the BFS traversal algorithm.
    The crawler starts by searching for images on a given web page (declared as the root node with depth 0),
    then moves on to its adjacent web pages (i.e. URL links to pages within it), adding images
    from those pages as well. The process continues until the required depth is reached.
    :param root_url: The starting page URL
    :param max_crawling_depth: The maximum depth of neighbouring pages to crawl
    :return: None
    """
    root_page = PageNode(root_url, 0)
    pages_to_crawl = deque()
    pages_to_crawl.append(root_page)
    # Track visited URLs so the same page is never crawled twice.
    visited_urls = {root_page.get_source_url()}
    while pages_to_crawl:
        current_page = pages_to_crawl.popleft()
        current_depth = current_page.get_node_depth()
        if current_depth > max_crawling_depth:
            # BFS order guarantees every remaining node is at least this deep, so we can stop.
            break
        extract_images_source_paths(current_page)
        adjacent_urls = extract_adjacent_urls(current_page)
        for adjacent_url in adjacent_urls:
            if adjacent_url not in visited_urls:
                visited_urls.add(adjacent_url)
                pages_to_crawl.append(PageNode(adjacent_url, current_depth + 1))

def extract_images_source_paths(page_node):
    """
    Given a page node, write all image source paths found on its page to the results file.
    :param page_node: A PageNode instance
    :return: None
    """
    html_data = get_page_body(page_node)
    soup = BeautifulSoup(html_data, 'html.parser')
    for item in soup.find_all('img'):
        if item.has_attr('src'):
            current_image = {"imageUrl": item['src'],
                             "sourceUrl": page_node.get_source_url(),
                             "depth": page_node.get_node_depth()}
            write_to_results_file(current_image, RESULTS_FILE_PATH)

def extract_adjacent_urls(page_node):
    """
    Given a page node, find all URL paths contained in its body and return them.
    Only absolute links starting with 'https://' are followed.
    :param page_node: the given page to extract URLs from
    :return: a list of URL paths contained in the given page body.
    """
    soup = BeautifulSoup(get_page_body(page_node), 'html.parser')
    adjacent_urls = []
    for link in soup.find_all('a', attrs={'href': re.compile("^https://")}):
        adjacent_urls.append(link.get('href'))
    return adjacent_urls

def get_page_body(page_node):
    """
    Return the HTML body of a given page node's URL.
    :param page_node: Page node holding the URL to access
    :return: String - The HTML response body
    """
    url = page_node.get_source_url()
    # A timeout keeps the crawler from hanging indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    return response.text

def write_to_results_file(new_data, file_name=RESULTS_FILE_PATH):
    """
    Append new data to the given JSON file.
    :param new_data: the data to append
    :param file_name: the JSON file path
    :return: None
    """
    with open(file_name, 'r+') as file:
        file_data = json.load(file)
        file_data["results"].append(new_data)
        # Rewrite the file from the start; the JSON only ever grows, so no truncation is needed.
        file.seek(0)
        json.dump(file_data, file, indent=4)

if __name__ == "__main__":
    main()
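
# For illustration only: after a run, results.json would look roughly like the sketch below
# (the URLs are hypothetical placeholders, not real crawl output):
#
# {
#     "results": [
#         {
#             "imageUrl": "https://example.com/logo.png",
#             "sourceUrl": "https://example.com",
#             "depth": 0
#         }
#     ]
# }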