-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapper.py
67 lines (42 loc) · 2.04 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from urllib.request import urlopen
import re
# Scheme and host of the wiki being scraped; site-relative hrefs found in
# pages are appended to this to form absolute URLs.
BASE_URL = 'https://regularshow.fandom.com'
# Directory (note the trailing slash) that transcript .txt files are written to.
DATA_DIR = 'data/'
def get_page_html(page_url: str) -> str:
    """Download *page_url* and return its body decoded as UTF-8 text.

    Args:
        page_url: URL to fetch; handed unchanged to ``urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response (and its underlying socket)
    # promptly, even when read() or decode() raises.
    with urlopen(page_url) as response:
        return response.read().decode("utf-8")
def get_episode_links(contents_url: str) -> list:
    """Collect absolute URLs of every transcript page linked from a listing.

    The page at *contents_url* is downloaded and scanned for ``<a>`` tags
    whose ``href`` ends in ``/Transcript``. Each captured href is prefixed
    with ``BASE_URL``.

    Args:
        contents_url: URL of the listing page to scan.

    Returns:
        A list of URL strings, in the order the anchors appear in the page.
    """
    page = get_page_html(contents_url)
    transcript_anchor = re.compile(r'<a[^>]*?href="([^"]*?/Transcript)"[^>]*?>')

    episode_urls = []
    for href in transcript_anchor.findall(page):
        episode_urls.append(BASE_URL + href)
    return episode_urls
def get_episode_title(html) -> str:
    """Extract the episode title from a transcript page, safe for a file name.

    Two page layouts are tried in order:

    1. ``<span class="mw-page-title-main">Title/Transcript</span>`` - the
       ``/Transcript`` suffix is removed when present.
    2. ``<h1 id="firstHeading">..."Title"...</h1>`` - the first
       double-quoted text inside the heading is used.

    Characters that are not allowed in file names (``<>:"/\\|?*``) are
    removed from the result.

    Args:
        html: Full HTML source of the transcript page.

    Returns:
        The sanitized episode title.

    Raises:
        ValueError: If neither layout yields a title.
    """
    span_title_pattern = re.compile(r'<span[^>]*class="mw-page-title-main"[^>]*>(.*?)</span>', re.DOTALL)
    h1_title_pattern = re.compile(r'<h1[^>]*id="firstHeading"[^>]*>(.*?)</h1>', re.DOTALL)
    suffix = '/Transcript'

    span_match = span_title_pattern.search(html)
    if span_match:
        title = span_match.group(1).strip()
        # Only drop the suffix when it is really there; slicing a fixed
        # number of characters would mangle titles that lack it.
        if title.endswith(suffix):
            title = title[:-len(suffix)]
    else:
        h1_match = h1_title_pattern.search(html)
        if h1_match is None:
            raise ValueError("no episode title heading found in page HTML")
        quoted = re.search(r'"(.*?)"', h1_match.group(1))
        if quoted is None:
            raise ValueError("page heading does not contain a quoted episode title")
        title = quoted.group(1)

    return re.sub(r'[<>:"/\\|?*]', '', title)
def paragraphs_outside_tables(html: str) -> list:
    """Return the inner HTML of every ``<p>...</p>`` that is not in a table.

    A paragraph counts as inside a table when its opening ``<p>`` tag lies
    within the span of a ``<table ...>...</table>`` match. Only bare ``<p>``
    tags (no attributes) are recognised.

    Args:
        html: HTML source to scan.

    Returns:
        Paragraph contents (without the ``<p>`` tags) in document order.
    """
    paragraph_pattern = re.compile(r'<p>(.*?)</p>', re.DOTALL)
    table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)
    table_spans = [m.span() for m in table_pattern.finditer(html)]

    # Use each paragraph's own match position. Searching the page for the
    # paragraph text instead would find the first occurrence, so repeated
    # text would be classified by wherever it appears first.
    return [
        paragraph.group(1)
        for paragraph in paragraph_pattern.finditer(html)
        if not any(start <= paragraph.start() < end for start, end in table_spans)
    ]
def cleanup_html(html) -> str:
    """Convert an HTML fragment to plain text.

    Tags are removed, HTML entities are decoded, and non-breaking spaces
    are replaced with ordinary spaces.

    Args:
        html: HTML fragment to clean.

    Returns:
        The fragment's text content.
    """
    # Imported by name because the ``html`` parameter shadows the stdlib module.
    from html import unescape

    # ``[^>]*`` also matches newlines, so tags wrapped over several lines
    # are removed as well.
    without_tags = re.sub(r'<[^>]*>', '', html)
    # Decode entities only after the tags are gone, so escaped markup such
    # as ``&lt;b&gt;`` stays as literal text. The non-breaking space is
    # spelled as an escape so it cannot be mistaken for a plain space.
    return unescape(without_tags).replace('\xa0', ' ')
def load_episode(episode_url: str) -> tuple:
    """Download one transcript page and return its title and plain text.

    Args:
        episode_url: URL of the transcript page.

    Returns:
        A ``(title, transcript)`` pair. ``title`` comes from
        ``get_episode_title``. ``transcript`` is the concatenation of all
        paragraphs found outside tables, passed through ``cleanup_html``.
    """
    page = get_page_html(episode_url)
    # Paragraphs are concatenated with no separator before cleanup.
    transcript_markup = ''.join(paragraphs_outside_tables(page))
    episode_title = get_episode_title(page)
    transcript_text = cleanup_html(transcript_markup)
    return episode_title, transcript_text
def main():
    """Download every listed transcript and save each as ``<title>.txt``.

    Episode links are collected from two pages of the wiki's
    ``Category:Transcripts`` listing. Each episode is then fetched and its
    transcript written, UTF-8 encoded, into ``DATA_DIR``. The directory is
    created if it does not exist yet.
    """
    # Local import: ``os`` is not among the file's top-level imports.
    import os

    contents_page1 = BASE_URL + '/wiki/Category:Transcripts'
    # Second listing page, addressed through the category's ``from`` parameter.
    contents_page2 = contents_page1 + '?from=Space+Escape%2FTranscript'
    links = get_episode_links(contents_page1) + get_episode_links(contents_page2)

    # Without this, the first open() fails on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)

    for link in links:
        title, transcript = load_episode(link)
        with open(os.path.join(DATA_DIR, title + ".txt"), "w", encoding='utf-8') as f:
            f.write(transcript)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()