-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_urls.py
113 lines (93 loc) · 3.88 KB
/
filter_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import re
from urllib.parse import urljoin
from requesting_urls import get_html
## -- Task 2 -- ##
def find_urls(
html: str,
base_url: str = "https://en.wikipedia.org",
output: str = None,
) -> set:
"""Find all the url links in a html text using regex
Arguments:
html (str): html string to parse
Returns:
urls (set) : set with all the urls found in html text
"""
# create and compile regular expression(s)
urls = set() # set of urls
# 1. find all the anchor tags, then
# 2. find the urls href attributes
# anc_pat finds all the <anc alt="..." src="..."> snippets
# this finds <a and collects everything up to the closing '>'
anc_pat = re.compile(r"<a[^>]+>", flags=re.IGNORECASE)
# href finds the text between quotes of the `href` attribute
url_pat = re.compile(r'href="([^"]+)"', flags=re.IGNORECASE)
# finding all the a tags
for anc_tag in anc_pat.findall(html):
anc_tags = url_pat.findall(anc_tag)
addstring = ""
# then find the attribute href of the ancor
for href_tekst in anc_tags:
addstring = re.sub(r'#[^>]+', '', href_tekst) #ignoring the the rest of the text when # appears
if href_tekst[0] == '/' and href_tekst[1] != '/': #if its only one /
addstring = re.sub(r'^/', base_url+"/", addstring) # adding the base url
elif href_tekst[0] == '/' and href_tekst[1] == '/':
addstring = re.sub(r'^//', "https://", addstring)
#if the addstring is not empty, add to the set
if addstring != "":
urls.add(addstring)
# Write to file if requested
if output:
text_file = open(f"{output}", "w")
print(f"Writing to: {output}")
for tags in urls:
print(f"{tags}", file=text_file)
#returning the set of urls
return urls
def find_articles(html: str, output=None) -> set:
"""Finds all the wiki articles inside a html text. Make call to find urls, and filter
arguments:
- text (str) : the html text to parse
returns:
- (set) : a set with urls to all the articles found
"""
urls = find_urls(html) # calling the function find_urls to pick up all urls of the html we pass in
pattern = re.compile(r'.*wikipedia.org/wiki.*$', flags=re.IGNORECASE) # making a pattern to find the articles of the urls
articles = set() # Set of Articles
# looping through all the urls
for url in urls:
# finding the matching url, out of the pattern we have
match_articles = pattern.findall(url)
# for each aricle matching, adding to the set articles
for article in match_articles:
articles.add(article)
# Write to file if wanted
if output:
text_file = open(f"{output}", "w")
print(f"Writing to: {output}")
for article in articles:
print(f"{article}", file=text_file)
#returning set of articles
return articles
## Regex example
def find_img_src(html: str):
"""Find all src attributes of img tags in an HTML string
Args:
html (str): A string containing some HTML.
Returns:
src_set (set): A set of strings containing image URLs
The set contains every found src attibute of an img tag in the given HTML.
"""
# img_pat finds all the <img alt="..." src="..."> snippets
# this finds <img and collects everything up to the closing '>'
img_pat = re.compile(r"<img[^>]+>", flags=re.IGNORECASE)
# src finds the text between quotes of the `src` attribute
src_pat = re.compile(r'src="([^"]+)"', flags=re.IGNORECASE)
src_set = set()
# first, find all the img tags
for img_tag in img_pat.findall(html):
# then, find the src attribute of the img, if any
match = src_pat.search(img_tag)
if match:
src_set.add(match.group(1))
return src_set