-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathboard_page.py
110 lines (94 loc) · 3.68 KB
/
board_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
Page listing all job listings
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
from models import ATSBaseURL
class BoardPage:
    """
    The page listing all job listings.

    Wraps the BeautifulSoup of a job-board page, resolving iframe/noframes
    indirection to the real board markup and identifying which ATS platform
    hosts it.
    """

    def __init__(self, page_soup: BeautifulSoup, url: str) -> None:
        self.url = url
        self._iterate_to_base_soup(page_soup)
        self._identify_base_url(url)

    def _iterate_to_base_soup(self, page_soup: BeautifulSoup) -> None:
        """
        Follow iframes (or <noframes> fallback links) until the base board
        page is found; sets self.page_soup.

        Assumption: the base soup probably has a host of ATSBaseURL links.
        """
        ats_base_urls = list(ATSBaseURL.__members__.values())
        # Default to the soup we were given if no ATS frame is found
        self.page_soup = page_soup
        # First, try and parse through iframes because they're handled differently
        for iframe in page_soup.find_all("iframe"):
            # Lazy-loaded frames keep the real URL in data-src, not src
            iframe_src = iframe.attrs.get("src", "")
            if "http" not in iframe_src:
                iframe_src = iframe.attrs.get("data-src", "")
            for ats_base_url in ats_base_urls:
                if ats_base_url in iframe_src:
                    with urlopen(iframe_src) as response:
                        self.page_soup = BeautifulSoup(response, "html.parser")
                    return
        # Fallback: some pages expose the board link inside <noframes>
        for noframe in page_soup.find_all("noframes"):
            for link in noframe.find_all("a", href=True):
                link_href = link.attrs.get("href", "")
                for ats_base_url in ats_base_urls:
                    if ats_base_url in link_href:
                        with urlopen(link_href) as response:
                            self.page_soup = BeautifulSoup(response, "html.parser")
                        return

    def _identify_base_url(self, url: str) -> bool:
        """
        Takes in a URL and figures out which ATS platform the page is hosted on.

        Sets self.ats_base_url (None when no known platform matches) and
        returns True on a match, False otherwise.
        """
        ats_base_urls = list(ATSBaseURL.__members__.values())
        # Attempt to match the current url first.
        # BUG FIX: the original tested `url in ats_base_url`, which is
        # inverted — the full page URL contains the platform base URL, not
        # the other way round. This matches the containment direction used
        # everywhere else in this class.
        for ats_base_url in ats_base_urls:
            if ats_base_url in url:
                self.ats_base_url = ats_base_url
                return True
        # Otherwise look for a platform link anywhere in the page markup
        # (stringify the soup once, not per candidate).
        page_markup = str(self.page_soup)
        for ats_base_url in ats_base_urls:
            if ats_base_url in page_markup:
                self.ats_base_url = ats_base_url
                return True
        # Not an ATS system; set to none
        self.ats_base_url = None
        return False

    def scrape_all_relevant_roles(self, role_keywords: list[str]) -> list[str]:
        """
        Scrapes self.page_soup for all relevant roles; returns a list of links.

        A "relevant role" is any leaf tag (no child tags) whose text contains
        one of the given keywords and that carries an href.
        """
        role_links: list[str] = []
        for role_keyword in role_keywords:
            # keyword=role_keyword binds the loop variable early so the
            # lambda doesn't close over the last keyword (late binding).
            role_tags = self.page_soup.find_all(
                lambda tag, keyword=role_keyword: len(tag.find_all()) == 0
                and keyword in tag.text
            )
            for role_tag in role_tags:
                href = role_tag.attrs.get("href", "")
                if href and self.ats_base_url:
                    if self.ats_base_url in href:
                        role_links.append(href)
                    elif href.startswith("/"):
                        # Relative link: anchor it to the ATS base domain
                        role_links.append(f"https://{self.ats_base_url}{href}")
        return role_links

    def get_page_soup(self) -> BeautifulSoup:
        """
        Returns the bs4 obj
        """
        return self.page_soup

    def get_url(self) -> str:
        """
        Returns the base url
        """
        return self.url

    def get_ats_base_url(self) -> "str | None":
        """
        Returns the ATS system base URL, or None when the page was not
        identified as a known ATS platform.
        """
        return self.ats_base_url