-
Notifications
You must be signed in to change notification settings - Fork 1
/
corpus.py
43 lines (37 loc) · 1.4 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#Animesh Agrawal animesha 50254531
#Micheal Kirk kirkmc 49847974
#Rachel Lam rslam 24554220
import json
import os
from urllib.parse import urlparse
class Corpus:
    """
    Handles corpus-related lookups, such as mapping a URL to the local file
    that stores its downloaded content.

    The mapping is read from the corpus bookkeeping JSON file, whose keys are
    "<directory>/<file>" identifiers and whose values are URLs stored without
    a scheme prefix.
    """

    # The corpus directory name
    WEBPAGES_RAW_NAME = "WEBPAGES_RAW"
    # The corpus JSON mapping file
    JSON_FILE_NAME = os.path.join(".", WEBPAGES_RAW_NAME, "bookkeeping.json")

    def __init__(self):
        """
        Load the bookkeeping file and build the reverse (url -> file id) map.

        Raises OSError if the bookkeeping file cannot be opened and
        json.JSONDecodeError if it is not valid JSON.
        """
        # The encoding belongs on open(): json.load() no longer accepts an
        # `encoding` keyword (removed in Python 3.9). The context manager also
        # guarantees the file handle is closed.
        with open(self.JSON_FILE_NAME, encoding="utf-8") as mapping_file:
            self.file_url_map = json.load(mapping_file)
        # Inverse lookup; if two file ids share a URL, the later entry wins.
        self.url_file_map = {
            url: file_id for file_id, url in self.file_url_map.items()
        }
        self.corp_length = len(self.file_url_map)

    def get_corpus_length(self) -> int:
        """Return the number of entries in the bookkeeping file."""
        return self.corp_length

    def get_file_name(self, url: str):
        """
        Given a url, look up the corresponding local file in the corpus.

        Surrounding whitespace and a leading "<scheme>://" prefix are removed
        before the lookup. Returns the local file path if the url is part of
        the corpus, otherwise None.
        """
        url = url.strip()
        scheme = urlparse(url).scheme
        prefix = scheme + "://"
        # Only drop the prefix when one is really there; slicing blindly would
        # chop the first three characters off a URL that has no scheme.
        # urlparse lower-cases the scheme, hence the case-insensitive compare.
        if scheme and url[:len(prefix)].lower() == prefix:
            url = url[len(prefix):]
        if url in self.url_file_map:
            # File ids look like "<directory>/<file>".
            parts = self.url_file_map[url].split("/")
            return os.path.join(".", self.WEBPAGES_RAW_NAME, parts[0], parts[1])
        return None