urlstore: simplify batch generation, change timelimit to time_limit add max_size arg (#91)

* urlstore: simplify download batch and add max_size arg

* use same args, itemgetter

* update actions workflow

* fix tests

* fix tests

* lint

* lint

* remove lines
adbar committed Apr 25, 2024
1 parent a69aa76 commit c811a02
Showing 3 changed files with 44 additions and 34 deletions.
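
For orientation, the headline API change is the snake_case rename of the time-limit keyword on get_download_urls plus a new cap on batch size. A minimal before/after call sketch, based on the urlstore.py diff below (URLs and values are illustrative, not part of the commit):

from courlan import UrlStore

store = UrlStore()
store.add_urls(["https://www.example.org/1", "https://test.org/1"])

# before this commit:
#   store.get_download_urls(timelimit=10)
# after this commit, with the renamed keyword and the added batch cap:
batch = store.get_download_urls(time_limit=10, max_urls=10000)
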
6 changes: 2 additions & 4 deletions .github/workflows/tests.yml
@@ -26,9 +26,9 @@ jobs:
- os: ubuntu-20.04
python-version: 3.7
- os: macos-latest
python-version: 3.8
python-version: "3.10"
- os: windows-latest
python-version: 3.8
python-version: "3.10"
steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
@@ -65,7 +65,6 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Code format with black
if: ${{ matrix.python-version != 'pypy3.8' }}
run: |
python -m pip install --upgrade black
black --check --diff courlan
@@ -74,7 +73,6 @@
run: python -m pip install -e "."

- name: Type checking with mypy
if: ${{ matrix.python-version != 'pypy3.8' }}
run: |
python -m pip install --upgrade mypy types-urllib3
mypy -p courlan
45 changes: 22 additions & 23 deletions courlan/urlstore.py
@@ -13,6 +13,7 @@
from collections import defaultdict, deque
from datetime import datetime, timedelta
from enum import Enum
from operator import itemgetter
from threading import Lock
from typing import (
Any,
@@ -149,7 +150,7 @@ def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
return deque()

def _set_done(self) -> None:
if not self.done and all(self.is_exhausted_domain(d) for d in self.urldict):
if not self.done and all(v.state != State.OPEN for v in self.urldict.values()):
with self._lock:
self.done = True

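
As an aside, the done-check above now inspects the per-domain state directly rather than probing each domain one by one. A rough, self-contained illustration of the predicate, using a simplified stand-in for the store's domain entries (not the actual courlan classes):

from enum import Enum, auto

class State(Enum):
    OPEN = auto()
    ALL_VISITED = auto()

# hypothetical stand-in for the values kept in UrlStore.urldict
states = {
    "https://test.org": State.ALL_VISITED,
    "https://www.example.org": State.OPEN,
}

# the store counts as done only once no domain is left in the OPEN state
done = all(state != State.OPEN for state in states.values())
print(done)  # False: one domain is still open
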
Expand Down Expand Up @@ -231,9 +232,6 @@ def _search_urls(
# preserve input order
return list(remaining_urls)

def _timestamp(self, domain: str) -> Optional[datetime]:
return self.urldict[domain].timestamp

# ADDITIONS AND DELETIONS

def add_urls(
@@ -301,12 +299,7 @@ def get_known_domains(self) -> List[str]:
def get_unvisited_domains(self) -> List[str]:
"""Find all domains for which there are unvisited URLs
and potentially adjust done meta-information."""
unvisited = []
if not self.done:
unvisited = [d for d, v in self.urldict.items() if v.state == State.OPEN]
if not unvisited:
self._set_done()
return unvisited
return [d for d, v in self.urldict.items() if v.state == State.OPEN]

def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
Expand Down Expand Up @@ -374,20 +367,26 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
self._set_done()
return None

def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
def get_download_urls(
self, max_urls: int = 10000, time_limit: int = 10
) -> Optional[List[str]]:
"""Get a list of immediately downloadable URLs according to the given
time limit per domain."""
potential = self.get_unvisited_domains()
targets = []
for domain in potential:
timestamp = self._timestamp(domain)
urls = []
for website, entry in self.urldict.items():
if entry.state != State.OPEN:
continue
if (
timestamp is None
or (datetime.now() - timestamp).total_seconds() > timelimit
not entry.timestamp
or (datetime.now() - entry.timestamp).total_seconds() > time_limit
):
targets.append(domain)
# get corresponding URLs and filter out None values
return list(filter(None, [self.get_url(domain) for domain in targets]))
url = self.get_url(website)
if url is not None:
urls.append(url)
if len(urls) >= max_urls:
break
self._set_done()
return urls

def establish_download_schedule(
self, max_urls: int = 100, time_limit: int = 10
Expand Down Expand Up @@ -420,9 +419,9 @@ def establish_download_schedule(
self.urldict[domain].count += 1
# determine timestamps
now = datetime.now()
original_timestamp = self._timestamp(domain)
original_timestamp = self.urldict[domain].timestamp
if (
original_timestamp is None
not original_timestamp
or (now - original_timestamp).total_seconds() > time_limit
):
schedule_secs = 0.0
@@ -439,7 +438,7 @@
self._store_urls(domain, url_tuples, timestamp=total_diff)
# sort by first tuple element (time in secs)
self._set_done()
return sorted(targets, key=lambda x: x[0]) # type: ignore[arg-type]
return sorted(targets, key=itemgetter(1)) # type: ignore[arg-type]

# CRAWLING

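
Taken together, the two batching helpers touched above can be exercised as in the following sketch; it assumes the post-commit signatures shown in the diff and uses made-up URLs:

from courlan import UrlStore

store = UrlStore()
store.add_urls(["https://test.org/" + str(i) for i in range(5)])

# immediate batch: URLs whose domains have been idle longer than
# time_limit seconds, capped at max_urls
batch = store.get_download_urls(max_urls=2, time_limit=0)

# timed batch: (seconds_to_wait, url) tuples spread out per domain
schedule = store.establish_download_schedule(max_urls=3, time_limit=1)
for wait_secs, url in schedule:
    print(round(wait_secs, 2), url)
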
27 changes: 20 additions & 7 deletions tests/urlstore_tests.py
@@ -16,7 +16,6 @@
import pytest

from courlan import UrlStore
from courlan.core import filter_links
from courlan.urlstore import State, load_store


@@ -287,18 +286,32 @@ def test_urlstore():
assert my_urls.total_url_number() == 20014

# get download URLs
downloadable_urls = my_urls.get_download_urls(timelimit=0)
downloadable_urls = my_urls.get_download_urls(time_limit=0, max_urls=1)
assert (
len(downloadable_urls) == 2
len(downloadable_urls) == 1
and downloadable_urls[0] == "https://www.example.org/1"
)
assert (
datetime.now() - my_urls.urldict["https://www.example.org"].timestamp
).total_seconds() < 0.25
assert my_urls.urldict["https://www.example.org"].count == 3
assert my_urls.urldict["https://test.org"].count == 1
downloadable_urls = my_urls.get_download_urls() # limit=10

# does not work on Windows?
# if os.name != "nt":
test_urls = UrlStore()
test_urls.add_urls(
["https://www.example.org/1", "https://test.org/1", "https://test.org/2"]
)
downloadable_urls = test_urls.get_download_urls(time_limit=0)
assert (
len(downloadable_urls) == 2
and downloadable_urls[0].startswith("https://www.example.org")
and downloadable_urls[1].startswith("https://test.org")
and test_urls.urldict["https://test.org"].count == 1
)
downloadable_urls = test_urls.get_download_urls()
assert len(downloadable_urls) == 0

other_store = UrlStore()
downloadable_urls = other_store.get_download_urls()
assert not downloadable_urls and other_store.done is True
Expand All @@ -320,14 +333,14 @@ def test_urlstore():
assert (
len(schedule) == 1
and round(schedule[0][0]) == 1
and schedule[0][1] == "https://www.example.org/2"
and schedule[0][1].startswith("https://www.example.org")
)
schedule = my_urls.establish_download_schedule(max_urls=6, time_limit=1)
assert len(schedule) == 6 and round(max(s[0] for s in schedule)) == 4
assert my_urls.urldict["https://www.example.org"].count == 7
assert (
my_urls.urldict["https://test.org"].count
== 4
== 3
== sum(u.visited is True for u in my_urls.urldict["https://test.org"].tuples)
)
assert my_urls.download_threshold_reached(8) is False
