# footyroom_grabber.py
import json
import re

import requests
from bs4 import BeautifulSoup

from models import League
from utils import debug, error


class FootyRoomGrabber:
    def start(self):
        leagues = self.get_leagues()
        for league in leagues:
            debug("Processing league: " + league[0].name, False)
            urls = self.get_page_urls(league[0], league[1])
            print(" => Found {} url pages. Processing urls...".format(len(urls)))
            counter = 1
            for url in urls:
                debug("Processing page {}: ".format(counter), False)
                # process_page returns a list of (match_url, match_img) tuples
                match_urls = self.process_page(url)
                print(" Found {}".format(len(match_urls)))
                for match in match_urls:
                    debug("Processing match {} ".format(match[0]), False)
                    media_urls = self.process_match(match[0], match[1])
                    print(" Found {}".format(len(media_urls)))
                    print(media_urls)
                counter += 1

    def get_leagues(self):
        url = "http://footyroom.com/"
        r = requests.get(url)
        if r.status_code != 200:
            error("Failed to process url!")
            return []
        soup = BeautifulSoup(r.content, "lxml")
        # Get the main content holder
        main_section = soup.find("section", {"class": "all-leagues-content"})
        debug("main_section: {}".format(main_section is not None))
        # Get its direct children; each div groups the leagues of one country
        league_group = main_section.find_all("div", recursive=False)
        leagues_to_rtn = []
        for lg_group in league_group:
            # Get the ul inside the div
            lst = lg_group.find("ul")
            # Get the country name from the header li of the ul
            country_li = lst.find("li", {"class": "all-leagues-header"})
            country = country_li.text
            leagues = lst.find_all("li", {"class": ""})
            for league_li in leagues:
                league = League(country, league_li.text)
                url = league_li.find("a")["href"]
                leagues_to_rtn.append((league, url))
        return leagues_to_rtn

    def get_page_urls(self, league, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        # Get the stageTree value. This is the league identifier used to
        # retrieve the matches associated with that league.
        stageTree = self.get_stage_tree(soup)
        urls = []
        for i in range(1, 1000):
            urls.append(
                "http://footyroom.com/posts-pagelet?page={}&stageTree={}".format(i, stageTree))
        return urls

    def process_page(self, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        main_div = soup.find("div", {"class": "posts-page clearfix "})
        if main_div is None:
            # Pages past the last real one come back without the posts container
            return []
        children_div = main_div.find_all(
            "div", {"class": "card col-xs-12 col-ms-6 col-md-4 "})
        matches = []
        for child in children_div:
            div = child.find("div", {"class": "card-image"})
            match_url = div.find("a")["href"]
            match_img = div.find("img")["src"]
            matches.append((match_url, match_img))
        return matches

    def process_match(self, url, img_url):
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        script_tags = soup.find_all("script")
        urls = []
        for tag in script_tags:
            if "DataStore.media " in tag.text:
                urls = urls + self.extract_media_urls(tag.text)
        return urls

    def extract_media_urls(self, text):
        media_urls = []
        patt = re.compile(r'\{(.*?)\}')
        for m in re.finditer(patt, text):
            media = json.loads(m.group(0))
            if "attachment" in media and "iframe" in media["attachment"]:
                # The attachment is an iframe, so extract the URL from its src attribute.
                tks = media["attachment"].split('src=')
                # The src value might be wrapped in double or single quotes;
                # whatever follows the opening quote up to its matching
                # closing quote is the URL.
                quote = tks[1][0]
                media_url = tks[1][1:].split(quote)[0]
                # Strip a protocol-relative prefix
                if media_url[0:2] == "//":
                    media_url = media_url[2:]
                media_urls.append(media_url)
                break
            elif "attachment" in media and "video" in media["attachment"]:
                # The attachment is a <video> element, e.g.:
                # <video controls preload="auto" width="100%" height="100%">
                # <source src="http://twii.edgeboss.net/download/twii/manutd/video_mufc_20180115_stoke_goals.ogg" type="video/ogg"></video>
                # Collect every quoted token that looks like a URL.
                patt = re.compile(r'\"(.*?)\"')
                defined = False
                for m in re.finditer(patt, media["attachment"]):
                    token = m.group(1)
                    if 'http' in token.lower():
                        defined = True
                        media_urls.append(token)
                if not defined:
                    error("COULD NOT FIND VIDEO ATTACHMENT")
                break
            else:
                # Plain source URL; strip the JSON backslash escapes
                media_urls.append(re.sub(r'\\(.)', r'\1', media["source"]))
                break
        return media_urls

    def get_stage_tree(self, soup):
        script_tags = soup.find_all("script")
        stageTree = None
        show = False
        # The quoted token that follows the "stageTree" key in the page's
        # inline scripts identifies the league.
        patt = re.compile('("([^"]|"")*")')
        for tag in script_tags:
            if stageTree is not None:
                break
            for m in re.finditer(patt, tag.text):
                if show:
                    stageTree = m.group(1)
                    break
                if "stagetree" in m.group(1).lower():
                    show = True
        if stageTree is None:
            return None
        return stageTree.replace('"', '')
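

# A minimal usage sketch (an assumption, not part of the original file): it
# presumes the local `utils` and `models` modules are on the import path and
# that footyroom.com still serves the markup this scraper expects.
if __name__ == "__main__":
    grabber = FootyRoomGrabber()
    grabber.start()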