-
Notifications
You must be signed in to change notification settings - Fork 0
/
SIteAvailabilityScraper.py
198 lines (157 loc) · 8.09 KB
/
SIteAvailabilityScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import urllib.request
import re
import math
from datetime import date, timedelta, datetime
from bs4 import BeautifulSoup, Tag
# TODO: Add pagination logic so we can fetch a large range of availability dates, enabling operations such as "any weekend in the next month". We will probably have to store dates as actual date objects, using the CST timezone as our base, so that libraries can tell us things such as the day of the week, which "weekend" operations need.
class SiteAvailabilityScraper:
    """
    Scrapes campsite availability for one park from texas.reserveworld.com.

    The requested date span is split into windows (one HTTP request per window)
    and the rows parsed from every window are merged by site-type name.
    """

    DATE_FORMAT = '%m/%d/%Y'

    def __init__(self, park_id, start_date, end_date):
        """
        Parse and validate the requested date span.

        Per the original author's note, the site only shows about 14 days per
        request, so get_availability_list() issues one request per 14-day window.

        :param park_id: id of the park (int or str). Find this by going on the page and viewing the calendar
                        search for the park you want
        :param start_date: string in form "mm/dd/yyyy"
        :param end_date: string in form "mm/dd/yyyy"
        :raises ValueError: if a date does not match DATE_FORMAT, or if end_date is not at least one day
                            past start_date
        """
        self.park_id = park_id
        self.start_date = datetime.strptime(start_date, self.DATE_FORMAT).date()
        self.end_date = datetime.strptime(end_date, self.DATE_FORMAT).date()
        if (self.end_date - self.start_date).days < 1:
            raise ValueError("Your end date needs to be at least one day past your start date")

    @staticmethod
    def __generate_url(park_id, start_date, end_date):
        """
        Build the calendar URL for one request window.

        :param park_id: id of the park (int or str)
        :param start_date: datetime.date used as the arrival date
        :param end_date: datetime.date used as the departure date
        :return: the URL as a string
        """
        # str() so that an integer park id (as the constructor documents) does not
        # raise TypeError during concatenation.
        return "http://texas.reserveworld.com/GeneralAvailabilityCalendar.aspx?campId=" \
            + str(park_id) + "&arrivalDate=" + start_date.strftime(SiteAvailabilityScraper.DATE_FORMAT) \
            + "&DepartureDate=" + end_date.strftime(SiteAvailabilityScraper.DATE_FORMAT)

    def get_availability_list(self):
        """
        Download and parse the availability calendar for the whole date span.

        :return: AvailabilityResults built from (site name, SiteType) pairs; a site type that appears in
                 several request windows is merged into a single SiteType
        :raises RuntimeError: if the expected table, header row or date columns are not in the page
        """
        window_days = 14

        def load_site(url):
            # BeautifulSoup reads the response in its constructor, so the
            # connection can be closed as soon as parsing is done.
            with urllib.request.urlopen(url) as page:
                return BeautifulSoup(page, "html.parser")

        def cell_text(tag):
            # get_text() also copes with empty cells and nested markup, where
            # `.string` is None and `.string.strip()` would raise.
            return tag.get_text(strip=True)

        def extract_header(td):
            # Prefer the text of the first link in the cell when there is one.
            anchor = td.find("a")
            return cell_text(anchor if anchor is not None else td)

        def find_header_row(soup):
            table = soup.find(id="ctl07_tblMain")
            if table is None:
                raise RuntimeError("Could not find the availability table (id 'ctl07_tblMain') in the page")
            header_row = table.find("tr", "altCampArea")
            if header_row is None:
                raise RuntimeError("Could not find the header row ('altCampArea') in the availability table")
            return header_row

        def find_date_columns(headers):
            # Date headers look like "3/02" or "03/02".
            regex = r'^\d{1,2}/\d{2}$'
            first = SiteAvailabilityHelper.get_index_of_first_match(headers, regex)
            last = SiteAvailabilityHelper.get_index_of_last_match(headers, regex)
            if first is None or last is None:
                raise RuntimeError("Could not find any date columns in the availability table header")
            return first, last

        def process_site_row(site_row, headers, first, last):
            cells = [cell_text(td) for td in site_row.find_all("td")]
            site_type = SiteType(cells[0])
            for column in range(first, last + 1):
                site_type.add_availability(SiteDate(headers[column], cells[column]))
            return site_type

        def get_date_ranges():
            total_days = (self.end_date - self.start_date).days
            if total_days <= window_days:
                return [(self.start_date, self.end_date)]
            whole_windows, remaining_days = divmod(total_days, window_days)
            date_ranges = []
            for window in range(whole_windows):
                range_start = self.start_date + timedelta(window_days * window)
                date_ranges.append((range_start, range_start + timedelta(window_days - 1)))
            # NOTE(review): when total_days is an exact multiple of 14 (and > 14) no
            # window ends on end_date itself -- confirm whether the departure date
            # is meant to be inclusive before changing this.
            if remaining_days > 0:
                range_start = self.start_date + timedelta(window_days * whole_windows)
                date_ranges.append((range_start, range_start + timedelta(remaining_days)))
            return date_ranges

        # Site types keyed by name; dicts keep insertion order, so the order in
        # which site types first appear on the page is preserved.
        merged = {}
        for range_start, range_end in get_date_ranges():
            soup = load_site(self.__generate_url(self.park_id, range_start, range_end))
            header_row = find_header_row(soup)
            headers = [extract_header(td) for td in header_row.find_all("td")]
            first, last = find_date_columns(headers)
            for sibling in header_row.next_siblings:
                # Siblings also include whitespace/text nodes; only tags are rows.
                if not isinstance(sibling, Tag):
                    continue
                site_type = process_site_row(sibling, headers, first, last)
                existing = merged.get(site_type.name)
                if existing is None:
                    merged[site_type.name] = site_type
                else:
                    # Same site type seen before (e.g. in an earlier window):
                    # fold the new dates in rather than discarding either set.
                    for site_date in site_type.get_availability().values():
                        existing.add_availability(site_date)
        return AvailabilityResults(list(merged.items()))
class SiteAvailabilityHelper:
    """Regex lookup helpers that operate on lists of strings."""

    @staticmethod
    def get_index_of_first_match(lis, regex):
        """
        Locate the first string in a list that the regular expression matches (via re.search)
        :param lis: list of strings
        :param regex: regular expression to match to
        :return: None if there was no matches. Int of index of the first match if there was a match
        """
        for position, candidate in enumerate(lis):
            if re.search(regex, candidate):
                return position
        return None

    @staticmethod
    def get_index_of_last_match(lis, regex):
        """
        Locate the last string in a list that the regular expression matches (via re.search)
        :param lis: list of strings
        :param regex: regular expression to match to
        :return: None if there was no matches. Int of index of the last match if there was a match
        """
        matching_positions = [position for position, candidate in enumerate(lis)
                              if re.search(regex, candidate)]
        if not matching_positions:
            return None
        return matching_positions[-1]
# TODO: Add in a SiteRule object that will take the rule, and the value, and store that as a map in the SiteType object
class SiteType:
    """A named kind of site together with its per-date availability entries."""

    def __init__(self, site_name):
        """
        :param site_name: name of the site type; stored as `name`
        """
        # Maps each entry's `date` attribute to the entry itself.
        self.site_availability = {}
        self.name = site_name

    def add_availability(self, site_date):
        """
        Record an availability entry, replacing any entry stored under the same date
        :param site_date: object exposing a `date` attribute, which is used as the key
        """
        key = site_date.date
        self.site_availability[key] = site_date

    def get_availability(self):
        """
        :return: the dict of recorded entries (the live dict, not a copy)
        """
        return self.site_availability
class SiteDate:
    """Pairs a date with the number of sites available on it."""

    def __init__(self, date, num_available):
        """
        :param date: the date this entry describes; stored unchanged
        :param num_available: the availability value for that date; stored unchanged
        """
        self.date, self.num_available = date, num_available
class DateAvailability:
    """Availability of every site type on one date, plus a running total."""

    def __init__(self, date):
        """
        :param date: the date this summary describes; stored unchanged
        """
        self.total_available = 0
        self.date = date
        # Maps site name -> the value passed to add_site() for that site.
        self._site_availability = dict()

    def add_site(self, site_name, num_free_sites):
        """
        Record how many sites of one type are free and update the total.

        Adding the same site name again replaces its previous value instead of
        counting it twice.

        :param site_name: name of the site type
        :param num_free_sites: number of free sites; anything int() accepts (e.g. "3" or 3)
        :raises ValueError: if num_free_sites cannot be converted to an int
        """
        # Convert first so a bad value raises before any state is modified.
        free_count = int(num_free_sites)
        if site_name in self._site_availability:
            # Back out the previous contribution so the total stays in sync
            # with what is stored for this site.
            self.total_available -= int(self._site_availability[site_name])
        self._site_availability[site_name] = num_free_sites
        self.total_available += free_count
class AvailabilityResults:
    """
    Lazily-built views over a list of (site name, site type) pairs:
    one keyed by site name and one keyed by date.
    """

    def __init__(self, site_types):
        """
        :param site_types: iterable of (site name, site type) pairs, where each site type exposes a
                           `site_availability` dict of date -> entry with a `num_available` attribute
        """
        self._site_types_list = site_types
        # Both caches are filled on first access of the matching property.
        self._site_availability = None
        self._date_availability = None

    @property
    def site_types(self):
        """
        :return: list of the site names in the results
        """
        # Must go through self: the bare name `site_availability` only resolved
        # when a module-level variable of that name happened to exist.
        return list(self.site_availability.keys())

    @property
    def site_availability(self):
        """
        :return: dict of site name -> site type (built once, then cached)
        """
        if self._site_availability is None:
            # Build fully before caching so a failure cannot leave a partial cache.
            self._site_availability = {entry[0]: entry[1] for entry in self._site_types_list}
        return self._site_availability

    @property
    def date_availability(self):
        """
        :return: dict of date -> DateAvailability summarising every site type on that date
                 (built once, then cached)
        """
        if self._date_availability is None:
            by_date = dict()
            for site_name, site_type in self._site_types_list:
                for availability_date, site_date in site_type.site_availability.items():
                    if availability_date not in by_date:
                        by_date[availability_date] = DateAvailability(availability_date)
                    by_date[availability_date].add_site(site_name, site_date.num_available)
            # Publish only after the whole structure was built successfully.
            self._date_availability = by_date
        return self._date_availability
# Example run against Enchanted Rock State Natural Area (park id "79").
# NOTE: these statements execute whenever the module is loaded.
scraper = SiteAvailabilityScraper(park_id="79", start_date="03/02/2018", end_date="03/05/2018")
availability_list = scraper.get_availability_list()
site_availability = availability_list.site_availability
overflow_sites = site_availability['OVERFLOW SITES']
# Entries are keyed by the header text of the date column ("mm/dd");
# this lookup still needs to be changed to take a date object.
march_second_availability = overflow_sites.get_availability()['03/02']
date_availability = availability_list.date_availability