listing_page.py
"""
Page listing all job listings
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
from models import ATSBaseURL
class ListingPage:
    """
    The page listing all job listings
    """

    def __init__(
        self, page_soup: BeautifulSoup, url: str, base_url: Optional[ATSBaseURL] = None
    ) -> None:
        self.url = url
        # Resolve the soup down to the actual ATS board page (it may live in an iframe)
        self._iterate_to_base_soup(page_soup)
        if base_url:
            self.ats_base_url = base_url
        else:
            self._identify_base_url(url)

    def _iterate_to_base_soup(self, page_soup: BeautifulSoup) -> None:
        """
        Loop through iframes (and noframes fallbacks) until the base board page is found
        Assumption: the base soup probably has a host of ATSBaseURL links
        """
        ats_base_urls = list(ATSBaseURL.__members__.values())
        self.page_soup = page_soup
        # First, try to parse through iframes because they're handled differently
        iframes = page_soup.find_all("iframe")
        for iframe in iframes:
            # Fetch the right source; fall back to data-src for lazily loaded iframes
            iframe_src = iframe.attrs.get("src", "")
            if "http" not in iframe_src:
                iframe_src = iframe.attrs.get("data-src", "")
            for ats_base_url in ats_base_urls:
                # Substring match assumes ATSBaseURL members are str-valued
                if ats_base_url in iframe_src:
                    with urlopen(iframe_src) as response:
                        self.page_soup = BeautifulSoup(response, "html.parser")
                    return
        # Otherwise, fall back to <noframes> blocks, which only carry plain links
        noframes = page_soup.find_all("noframes")
        for noframe in noframes:
            links = noframe.find_all("a", href=True)
            for link in links:
                link_href = link.attrs.get("href", "")
                for ats_base_url in ats_base_urls:
                    if ats_base_url in link_href:
                        with urlopen(link_href) as response:
                            self.page_soup = BeautifulSoup(response, "html.parser")
                        return

    def _identify_base_url(self, url: str) -> bool:
        """
        Takes in a URL and figures out which ATS platform the page is hosted on
        """
        ats_base_urls = list(ATSBaseURL.__members__.values())
        # Attempt to match the ATS base url against the current url first
        for ats_base_url in ats_base_urls:
            if ats_base_url in url:
                self.ats_base_url = ats_base_url
                return True
        # Otherwise, look for ATS links anywhere in the page soup
        for ats_base_url in ats_base_urls:
            if ats_base_url in str(self.page_soup):
                self.ats_base_url = ats_base_url
                return True
        # Not an ATS system; set to None
        self.ats_base_url = None
        return False

    def scrape_job_title(self) -> str:
        """
        Scrapes self.page_soup for the right job title; returns a string
        """
        return ""

    def scrape_job_description(self) -> str:
        """
        Scrapes self.page_soup for the right job description; returns a string
        """
        return ""

    def get_page_soup(self) -> BeautifulSoup:
        """
        Returns the bs4 obj
        """
        return self.page_soup

    def get_url(self) -> str:
        """
        Returns the base url
        """
        return self.url

    def get_ats_base_url(self) -> Optional[ATSBaseURL]:
        """
        Returns the ATS system, or None if the page is not hosted on a known ATS
        """
        return self.ats_base_url
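

# --- Usage sketch (not part of the original file) ----------------------------
# A minimal example of how ListingPage might be driven from a live careers URL.
# Assumptions: ATSBaseURL is a str-valued Enum in models (so the substring checks
# above work against URLs), and the URL below is purely hypothetical.
if __name__ == "__main__":
    LISTING_URL = "https://example.com/careers"  # hypothetical careers page
    with urlopen(LISTING_URL) as resp:
        listing_soup = BeautifulSoup(resp, "html.parser")

    page = ListingPage(listing_soup, LISTING_URL)
    print(page.get_url())           # the URL the page was built from
    print(page.get_ats_base_url())  # matched ATSBaseURL member, or None
    print(page.scrape_job_title())  # empty string until scraping is implemented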