UrlStore: use bytes instead of str for url paths (#88)
* UrlStore: use bytes in keys

* use black

* urlpaths as bytes

* fix black

* simplify unvisited

* change variable name

* restore tests

* url paths as binary only
adbar committed May 30, 2024 · commit 9ac3d3c · 1 parent: 720ccbb
Showing 3 changed files with 25 additions and 21 deletions.
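The heart of the change sits in UrlPathTuple (see the urlstore.py hunks below): URL paths are now stored as UTF-8 bytes and only decoded back to str on demand. A minimal standalone sketch of that idea with an illustrative size comparison; the class is a simplified stand-in, and exact getsizeof figures vary across CPython versions and builds:

    import sys

    class PathTuple:
        "Simplified stand-in for courlan's UrlPathTuple."
        __slots__ = ("urlpath", "visited")

        def __init__(self, urlpath: str, visited: bool) -> None:
            # store the path as UTF-8 bytes instead of str
            self.urlpath = urlpath.encode("utf-8")
            self.visited = visited

        def path(self) -> str:
            # decode back to str only when a caller needs one
            return self.urlpath.decode("utf-8")

    t = PathTuple("/page/1", visited=False)
    assert t.urlpath == b"/page/1" and t.path() == "/page/1"
    # bytes objects carry less per-object overhead than str in CPython
    print(sys.getsizeof("/page/1"), sys.getsizeof(b"/page/1"))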
courlan/sampling.py (2 additions, 2 deletions)
@@ -23,9 +23,9 @@ def _make_sample(
     output_urls = []
     for domain in urlstore.urldict:  # key=cmp_to_key(locale.strcoll)
         urlpaths = [
-            p.urlpath
+            p.path()
             for p in urlstore._load_urls(domain)
-            if p.urlpath not in ("/", None)
+            if p.urlpath not in (b"/", None)
         ]
         # too few or too many URLs
         if (
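Note that the filter literal had to change together with the storage type: in Python 3, str and bytes never compare equal, so keeping "/" here would have silently stopped excluding root paths once urlpath became bytes. In short:

    assert "/" != b"/"               # str and bytes never compare equal
    assert b"/" in (b"/", None)      # matches the bytes-backed urlpath
    assert "/" not in (b"/", None)   # the old literal would match nothing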
courlan/urlstore.py (17 additions, 15 deletions)
@@ -63,9 +63,13 @@ class UrlPathTuple:
     __slots__ = ("urlpath", "visited")
 
     def __init__(self, urlpath: str, visited: bool) -> None:
-        self.urlpath: str = urlpath
+        self.urlpath: bytes = urlpath.encode("utf-8")
         self.visited: bool = visited
 
+    def path(self) -> str:
+        "Get the URL path as string."
+        return self.urlpath.decode("utf-8")
+
 
 class UrlStore:
     "Defines a class to store domain-classified URLs and perform checks against it."
@@ -180,16 +184,16 @@ def _store_urls(
             if self.urldict[domain].state is State.BUSTED:
                 return
             urls = self._load_urls(domain)
-            known = {u.urlpath for u in urls}
+            known = {u.path() for u in urls}
         else:
             urls = deque()
             known = set()
 
         # check if the link or its variants are known
         if to_right is not None:
-            urls.extend(t for t in to_right if not is_known_link(t.urlpath, known))
+            urls.extend(t for t in to_right if not is_known_link(t.path(), known))
         if to_left is not None:
-            urls.extendleft(t for t in to_left if not is_known_link(t.urlpath, known))
+            urls.extendleft(t for t in to_left if not is_known_link(t.path(), known))
 
         with self._lock:
             if self.compressed:
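Since known is now built from u.path(), both arguments of is_known_link stay on the str side. The diff does not show is_known_link itself, so the following is only a rough stand-in to illustrate the invariant being preserved; the variant logic is assumed, not courlan's actual implementation:

    known = {"/a", "/b/"}  # as built above: {u.path() for u in urls}

    def is_known_link_stub(link: str, known: set) -> bool:
        "Stand-in: treat a path as known if it or a slash variant was seen."
        return link in known or link.rstrip("/") in known or link + "/" in known

    assert is_known_link_stub("/b", known)      # "/b/" already stored
    assert not is_known_link_stub("/c", known)  # genuinely new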
@@ -216,14 +220,14 @@ def _search_urls(
         # init
         last_domain: Optional[str] = None
         known_paths: Dict[str, Optional[bool]] = {}
-        remaining_urls = {u: None for u in urls}
+        remaining_urls = dict.fromkeys(urls)
         # iterate
         for url in sorted(remaining_urls):
             hostinfo, urlpath = get_host_and_path(url)
             # examine domain
             if hostinfo != last_domain:
                 last_domain = hostinfo
-                known_paths = {u.urlpath: u.visited for u in self._load_urls(hostinfo)}
+                known_paths = {u.path(): u.visited for u in self._load_urls(hostinfo)}
             # run checks: case 1: the path matches, case 2: visited URL
             if urlpath in known_paths and (
                 switch == 1 or (switch == 2 and known_paths[urlpath])
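The remaining_urls rewrite is a pure simplification: dict.fromkeys builds the same insertion-ordered mapping to None as the old comprehension, minus the throwaway loop. For example:

    urls = ["https://example.org/b", "https://example.org/a"]
    assert dict.fromkeys(urls) == {u: None for u in urls}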
@@ -294,7 +298,7 @@ def reset(self) -> None:
 
     def get_known_domains(self) -> List[str]:
         "Return all known domains as a list."
-        return list(self.urldict)
+        return list(self.urldict.keys())
 
     def get_unvisited_domains(self) -> List[str]:
         """Find all domains for which there are unvisited URLs
@@ -316,14 +320,12 @@ def unvisited_websites_number(self) -> int:
 
     def find_known_urls(self, domain: str) -> List[str]:
         """Get all already known URLs for the given domain (ex. "https://example.org")."""
-        return [domain + u.urlpath for u in self._load_urls(domain)]
+        return [domain + u.path() for u in self._load_urls(domain)]
 
     def find_unvisited_urls(self, domain: str) -> List[str]:
         "Get all unvisited URLs for the given domain."
         if not self.is_exhausted_domain(domain):
-            return [
-                domain + u.urlpath for u in self._load_urls(domain) if not u.visited
-            ]
+            return [domain + u.path() for u in self._load_urls(domain) if not u.visited]
         return []
 
     def filter_unknown_urls(self, urls: List[str]) -> List[str]:
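These accessors show the pattern repeated through the rest of the file: storage stays bytes, but every value handed back to callers is rebuilt as str via path(). A sketch of the resulting guarantee, using the public API from this diff with made-up URLs:

    my_urls = UrlStore()
    my_urls.add_urls(["https://example.org/a", "https://example.org/b"])
    for url in my_urls.find_known_urls("https://example.org"):
        # callers get plain strings back, never bytes
        assert isinstance(url, str) and url.startswith("https://example.org/")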
@@ -342,7 +344,7 @@ def is_known(self, url: str) -> bool:
         "Check if the given URL has already been stored."
         hostinfo, urlpath = get_host_and_path(url)
         # returns False if domain or URL is new
-        return urlpath in {u.urlpath for u in self._load_urls(hostinfo)}
+        return urlpath in {u.path() for u in self._load_urls(hostinfo)}
 
     # DOWNLOADS
 
@@ -360,7 +362,7 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
                     with self._lock:
                         self.urldict[domain].count += 1
                     self._store_urls(domain, url_tuples, timestamp=datetime.now())
-                return domain + url.urlpath
+                return domain + url.path()
         # nothing to draw from
         with self._lock:
             self.urldict[domain].state = State.ALL_VISITED
@@ -419,7 +421,7 @@ def establish_download_schedule(
                 ):
                     break
                 if not url.visited:
-                    urlpaths.append(url.urlpath)
+                    urlpaths.append(url.path())
                     url.visited = True
                     with self._lock:
                         self.urldict[domain].count += 1
@@ -507,7 +509,7 @@ def print_urls(self) -> None:
             print(
                 "\n".join(
                     [
-                        domain + u.urlpath + "\t" + str(u.visited)
+                        f"{domain}{u.path()}\t{str(u.visited)}"
                         for u in self._load_urls(domain)
                     ]
                 ),
tests/urlstore_tests.py (6 additions, 4 deletions)
@@ -37,7 +37,7 @@ def test_urlstore():
     assert len(my_urls.urldict) == 1 and "http://example.org" not in my_urls.urldict
     assert len(my_urls.urldict["https://example.org"].tuples) == 2
     firstelem = my_urls.urldict["https://example.org"].tuples[0]
-    assert firstelem.urlpath == "/" and firstelem.visited is False
+    assert firstelem.urlpath == b"/" and firstelem.visited is False
     # reset
     num, _, _ = gc.get_count()
     my_urls.reset()
@@ -187,15 +187,15 @@ def test_urlstore():
     my_urls.add_urls(appendleft=extension_urls)
     url_tuples = my_urls._load_urls(example_domain)
     assert len(url_tuples) == len(example_urls) + 11
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
+    assert url_tuples[-1].urlpath == b"/1/9" and url_tuples[0].urlpath == b"/1/10"
 
     # duplicates
     my_urls.add_urls(extension_urls)
     my_urls.add_urls(appendleft=extension_urls)
     assert len(my_urls._load_urls(example_domain)) == len(example_urls) + len(
         extension_urls
     )
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
+    assert url_tuples[-1].urlpath == b"/1/9" and url_tuples[0].urlpath == b"/1/10"
 
     # get_url
     assert my_urls.urldict[example_domain].timestamp is None
@@ -221,7 +221,9 @@ def test_urlstore():
 
     url_tuples = my_urls._load_urls(example_domain)
     # positions
-    assert url1.endswith(url_tuples[0].urlpath) and url2.endswith(url_tuples[1].urlpath)
+    assert url1.endswith(url_tuples[0].urlpath.decode("utf-8")) and url2.endswith(
+        url_tuples[1].urlpath.decode("utf-8")
+    )
     # timestamp
     assert my_urls.urldict[example_domain].timestamp is not None
     # nothing left
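The last hunk decodes the stored bytes before comparing, keeping the assertion on the str side where url1 and url2 live; the equivalent check could also be made on the bytes side. With a hypothetical stored tuple:

    url1 = "https://www.example.org/1"    # made-up test URL
    urlpath = b"/1"                       # the corresponding UrlPathTuple.urlpath
    assert url1.endswith(urlpath.decode("utf-8"))   # the test's approach
    assert url1.encode("utf-8").endswith(urlpath)   # bytes-side equivalent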