UrlStore: use bytes instead of str for url paths (#88)
* UrlStore: use bytes in keys

* use black

* urlpaths as bytes

* fix black

* simplify unvisited

* change variable name

* restore tests

* url paths as binary only
adbar committed May 30, 2024 · commit 9ac3d3c · 1 parent: 720ccbb
Showing 3 changed files with 25 additions and 21 deletions.
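The heart of the change sits in UrlPathTuple (see the urlstore.py hunks below): URL paths are now stored as UTF-8 bytes and only decoded back to str on demand. A minimal standalone sketch of that idea with an illustrative size comparison; the class is a simplified stand-in, and exact getsizeof figures vary across CPython versions and builds:

    import sys

    class PathTuple:
        "Simplified stand-in for courlan's UrlPathTuple."
        __slots__ = ("urlpath", "visited")

        def __init__(self, urlpath: str, visited: bool) -> None:
            # store the path as UTF-8 bytes instead of str
            self.urlpath = urlpath.encode("utf-8")
            self.visited = visited

        def path(self) -> str:
            # decode back to str only when a caller needs one
            return self.urlpath.decode("utf-8")

    t = PathTuple("/page/1", visited=False)
    assert t.urlpath == b"/page/1" and t.path() == "/page/1"
    # bytes objects carry less per-object overhead than str in CPython
    print(sys.getsizeof("/page/1"), sys.getsizeof(b"/page/1"))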
courlan/sampling.py (2 additions, 2 deletions)
@@ -23,9 +23,9 @@ def _make_sample(
     output_urls = []
     for domain in urlstore.urldict:  # key=cmp_to_key(locale.strcoll)
         urlpaths = [
-            p.urlpath
+            p.path()
             for p in urlstore._load_urls(domain)
-            if p.urlpath not in ("/", None)
+            if p.urlpath not in (b"/", None)
         ]
         # too few or too many URLs
         if (
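Note that the filter literal had to change together with the storage type: in Python 3, str and bytes never compare equal, so keeping "/" here would have silently stopped excluding root paths once urlpath became bytes. In short:

    assert "/" != b"/"               # str and bytes never compare equal
    assert b"/" in (b"/", None)      # matches the bytes-backed urlpath
    assert "/" not in (b"/", None)   # the old literal would match nothing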
courlan/urlstore.py (17 additions, 15 deletions)
@@ -63,9 +63,13 @@ class UrlPathTuple:
     __slots__ = ("urlpath", "visited")
 
     def __init__(self, urlpath: str, visited: bool) -> None:
-        self.urlpath: str = urlpath
+        self.urlpath: bytes = urlpath.encode("utf-8")
         self.visited: bool = visited
 
+    def path(self) -> str:
+        "Get the URL path as string."
+        return self.urlpath.decode("utf-8")
+
 
 class UrlStore:
     "Defines a class to store domain-classified URLs and perform checks against it."
@@ -180,16 +184,16 @@ def _store_urls(
             if self.urldict[domain].state is State.BUSTED:
                 return
             urls = self._load_urls(domain)
-            known = {u.urlpath for u in urls}
+            known = {u.path() for u in urls}
         else:
             urls = deque()
             known = set()
 
         # check if the link or its variants are known
         if to_right is not None:
-            urls.extend(t for t in to_right if not is_known_link(t.urlpath, known))
+            urls.extend(t for t in to_right if not is_known_link(t.path(), known))
         if to_left is not None:
-            urls.extendleft(t for t in to_left if not is_known_link(t.urlpath, known))
+            urls.extendleft(t for t in to_left if not is_known_link(t.path(), known))
 
         with self._lock:
             if self.compressed:
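Since known is now built from u.path(), both arguments of is_known_link stay on the str side. The diff does not show is_known_link itself, so the following is only a rough stand-in to illustrate the invariant being preserved; the variant logic is assumed, not courlan's actual implementation:

    known = {"/a", "/b/"}  # as built above: {u.path() for u in urls}

    def is_known_link_stub(link: str, known: set) -> bool:
        "Stand-in: treat a path as known if it or a slash variant was seen."
        return link in known or link.rstrip("/") in known or link + "/" in known

    assert is_known_link_stub("/b", known)      # "/b/" already stored
    assert not is_known_link_stub("/c", known)  # genuinely new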
@@ -216,14 +220,14 @@ def _search_urls(
         # init
         last_domain: Optional[str] = None
         known_paths: Dict[str, Optional[bool]] = {}
-        remaining_urls = {u: None for u in urls}
+        remaining_urls = dict.fromkeys(urls)
         # iterate
         for url in sorted(remaining_urls):
             hostinfo, urlpath = get_host_and_path(url)
             # examine domain
             if hostinfo != last_domain:
                 last_domain = hostinfo
-                known_paths = {u.urlpath: u.visited for u in self._load_urls(hostinfo)}
+                known_paths = {u.path(): u.visited for u in self._load_urls(hostinfo)}
             # run checks: case 1: the path matches, case 2: visited URL
             if urlpath in known_paths and (
                 switch == 1 or (switch == 2 and known_paths[urlpath])
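The remaining_urls rewrite is a pure simplification: dict.fromkeys builds the same insertion-ordered mapping to None as the old comprehension, minus the throwaway loop. For example:

    urls = ["https://example.org/b", "https://example.org/a"]
    assert dict.fromkeys(urls) == {u: None for u in urls}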
@@ -294,7 +298,7 @@ def reset(self) -> None:
 
     def get_known_domains(self) -> List[str]:
         "Return all known domains as a list."
-        return list(self.urldict)
+        return list(self.urldict.keys())
 
     def get_unvisited_domains(self) -> List[str]:
         """Find all domains for which there are unvisited URLs
@@ -316,14 +320,12 @@ def unvisited_websites_number(self) -> int:
 
     def find_known_urls(self, domain: str) -> List[str]:
         """Get all already known URLs for the given domain (ex. "https://example.org")."""
-        return [domain + u.urlpath for u in self._load_urls(domain)]
+        return [domain + u.path() for u in self._load_urls(domain)]
 
     def find_unvisited_urls(self, domain: str) -> List[str]:
         "Get all unvisited URLs for the given domain."
         if not self.is_exhausted_domain(domain):
-            return [
-                domain + u.urlpath for u in self._load_urls(domain) if not u.visited
-            ]
+            return [domain + u.path() for u in self._load_urls(domain) if not u.visited]
         return []
 
     def filter_unknown_urls(self, urls: List[str]) -> List[str]:
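These accessors show the pattern repeated through the rest of the file: storage stays bytes, but every value handed back to callers is rebuilt as str via path(). A sketch of the resulting guarantee, using the public API from this diff with made-up URLs:

    my_urls = UrlStore()
    my_urls.add_urls(["https://example.org/a", "https://example.org/b"])
    for url in my_urls.find_known_urls("https://example.org"):
        # callers get plain strings back, never bytes
        assert isinstance(url, str) and url.startswith("https://example.org/")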
@@ -342,7 +344,7 @@ def is_known(self, url: str) -> bool:
         "Check if the given URL has already been stored."
         hostinfo, urlpath = get_host_and_path(url)
         # returns False if domain or URL is new
-        return urlpath in {u.urlpath for u in self._load_urls(hostinfo)}
+        return urlpath in {u.path() for u in self._load_urls(hostinfo)}
 
     # DOWNLOADS
 
@@ -360,7 +362,7 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
                     with self._lock:
                         self.urldict[domain].count += 1
                     self._store_urls(domain, url_tuples, timestamp=datetime.now())
-                return domain + url.urlpath
+                return domain + url.path()
         # nothing to draw from
         with self._lock:
             self.urldict[domain].state = State.ALL_VISITED
@@ -419,7 +421,7 @@ def establish_download_schedule(
                 ):
                     break
                 if not url.visited:
-                    urlpaths.append(url.urlpath)
+                    urlpaths.append(url.path())
                     url.visited = True
                     with self._lock:
                         self.urldict[domain].count += 1
@@ -507,7 +509,7 @@ def print_urls(self) -> None:
             print(
                 "\n".join(
                     [
-                        domain + u.urlpath + "\t" + str(u.visited)
+                        f"{domain}{u.path()}\t{str(u.visited)}"
                         for u in self._load_urls(domain)
                     ]
                 ),
tests/urlstore_tests.py (6 additions, 4 deletions)
@@ -37,7 +37,7 @@ def test_urlstore():
     assert len(my_urls.urldict) == 1 and "http://example.org" not in my_urls.urldict
     assert len(my_urls.urldict["https://example.org"].tuples) == 2
     firstelem = my_urls.urldict["https://example.org"].tuples[0]
-    assert firstelem.urlpath == "/" and firstelem.visited is False
+    assert firstelem.urlpath == b"/" and firstelem.visited is False
     # reset
     num, _, _ = gc.get_count()
     my_urls.reset()
@@ -187,15 +187,15 @@ def test_urlstore():
     my_urls.add_urls(appendleft=extension_urls)
     url_tuples = my_urls._load_urls(example_domain)
     assert len(url_tuples) == len(example_urls) + 11
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
+    assert url_tuples[-1].urlpath == b"/1/9" and url_tuples[0].urlpath == b"/1/10"
 
     # duplicates
     my_urls.add_urls(extension_urls)
     my_urls.add_urls(appendleft=extension_urls)
     assert len(my_urls._load_urls(example_domain)) == len(example_urls) + len(
         extension_urls
     )
-    assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"
+    assert url_tuples[-1].urlpath == b"/1/9" and url_tuples[0].urlpath == b"/1/10"
 
     # get_url
     assert my_urls.urldict[example_domain].timestamp is None
@@ -221,7 +221,9 @@ def test_urlstore():
 
     url_tuples = my_urls._load_urls(example_domain)
     # positions
-    assert url1.endswith(url_tuples[0].urlpath) and url2.endswith(url_tuples[1].urlpath)
+    assert url1.endswith(url_tuples[0].urlpath.decode("utf-8")) and url2.endswith(
+        url_tuples[1].urlpath.decode("utf-8")
+    )
     # timestamp
     assert my_urls.urldict[example_domain].timestamp is not None
     # nothing left
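The last hunk decodes the stored bytes before comparing, keeping the assertion on the str side where url1 and url2 live; the equivalent check could also be made on the bytes side. With a hypothetical stored tuple:

    url1 = "https://www.example.org/1"    # made-up test URL
    urlpath = b"/1"                       # the corresponding UrlPathTuple.urlpath
    assert url1.endswith(urlpath.decode("utf-8"))   # the test's approach
    assert url1.encode("utf-8").endswith(urlpath)   # bytes-side equivalent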