urlstore: simplify batch generation, change timelimit to time_limit add max_size arg (#91)

* urlstore: simplify download batch and add max_size arg

* use same args, itemgetter

* update actions workflow

* fix tests

* fix tests

* lint

* lint

* remove lines
adbar committed Apr 25, 2024
1 parent a69aa76 commit c811a02
Showing 3 changed files with 44 additions and 34 deletions.
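
For orientation, the headline API change is the snake_case rename of the time-limit keyword on get_download_urls plus a new cap on batch size. A minimal before/after call sketch, based on the urlstore.py diff below (URLs and values are illustrative, not part of the commit):

from courlan import UrlStore

store = UrlStore()
store.add_urls(["https://www.example.org/1", "https://test.org/1"])

# before this commit:
#   store.get_download_urls(timelimit=10)
# after this commit, with the renamed keyword and the added batch cap:
batch = store.get_download_urls(time_limit=10, max_urls=10000)
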
6 changes: 2 additions & 4 deletions .github/workflows/tests.yml
@@ -26,9 +26,9 @@ jobs:
- os: ubuntu-20.04
python-version: 3.7
- os: macos-latest
python-version: 3.8
python-version: "3.10"
- os: windows-latest
python-version: 3.8
python-version: "3.10"
steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
@@ -65,7 +65,6 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Code format with black
if: ${{ matrix.python-version != 'pypy3.8' }}
run: |
python -m pip install --upgrade black
black --check --diff courlan
@@ -74,7 +73,6 @@
run: python -m pip install -e "."

- name: Type checking with mypy
if: ${{ matrix.python-version != 'pypy3.8' }}
run: |
python -m pip install --upgrade mypy types-urllib3
mypy -p courlan
45 changes: 22 additions & 23 deletions courlan/urlstore.py
@@ -13,6 +13,7 @@
from collections import defaultdict, deque
from datetime import datetime, timedelta
from enum import Enum
from operator import itemgetter
from threading import Lock
from typing import (
Any,
@@ -149,7 +150,7 @@ def _load_urls(self, domain: str) -> Deque[UrlPathTuple]:
return deque()

def _set_done(self) -> None:
if not self.done and all(self.is_exhausted_domain(d) for d in self.urldict):
if not self.done and all(v.state != State.OPEN for v in self.urldict.values()):
with self._lock:
self.done = True

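
As an aside, the done-check above now inspects the per-domain state directly rather than probing each domain one by one. A rough, self-contained illustration of the predicate, using a simplified stand-in for the store's domain entries (not the actual courlan classes):

from enum import Enum, auto

class State(Enum):
    OPEN = auto()
    ALL_VISITED = auto()

# hypothetical stand-in for the values kept in UrlStore.urldict
states = {
    "https://test.org": State.ALL_VISITED,
    "https://www.example.org": State.OPEN,
}

# the store counts as done only once no domain is left in the OPEN state
done = all(state != State.OPEN for state in states.values())
print(done)  # False: one domain is still open
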
Expand Down Expand Up @@ -231,9 +232,6 @@ def _search_urls(
# preserve input order
return list(remaining_urls)

def _timestamp(self, domain: str) -> Optional[datetime]:
return self.urldict[domain].timestamp

# ADDITIONS AND DELETIONS

def add_urls(
@@ -301,12 +299,7 @@ def get_known_domains(self) -> List[str]:
def get_unvisited_domains(self) -> List[str]:
"""Find all domains for which there are unvisited URLs
and potentially adjust done meta-information."""
unvisited = []
if not self.done:
unvisited = [d for d, v in self.urldict.items() if v.state == State.OPEN]
if not unvisited:
self._set_done()
return unvisited
return [d for d, v in self.urldict.items() if v.state == State.OPEN]

def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
Expand Down Expand Up @@ -374,20 +367,26 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
self._set_done()
return None

def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
def get_download_urls(
self, max_urls: int = 10000, time_limit: int = 10
) -> Optional[List[str]]:
"""Get a list of immediately downloadable URLs according to the given
time limit per domain."""
potential = self.get_unvisited_domains()
targets = []
for domain in potential:
timestamp = self._timestamp(domain)
urls = []
for website, entry in self.urldict.items():
if entry.state != State.OPEN:
continue
if (
timestamp is None
or (datetime.now() - timestamp).total_seconds() > timelimit
not entry.timestamp
or (datetime.now() - entry.timestamp).total_seconds() > time_limit
):
targets.append(domain)
# get corresponding URLs and filter out None values
return list(filter(None, [self.get_url(domain) for domain in targets]))
url = self.get_url(website)
if url is not None:
urls.append(url)
if len(urls) >= max_urls:
break
self._set_done()
return urls

def establish_download_schedule(
self, max_urls: int = 100, time_limit: int = 10
Expand Down Expand Up @@ -420,9 +419,9 @@ def establish_download_schedule(
self.urldict[domain].count += 1
# determine timestamps
now = datetime.now()
original_timestamp = self._timestamp(domain)
original_timestamp = self.urldict[domain].timestamp
if (
original_timestamp is None
not original_timestamp
or (now - original_timestamp).total_seconds() > time_limit
):
schedule_secs = 0.0
@@ -439,7 +438,7 @@
self._store_urls(domain, url_tuples, timestamp=total_diff)
# sort by first tuple element (time in secs)
self._set_done()
return sorted(targets, key=lambda x: x[0]) # type: ignore[arg-type]
return sorted(targets, key=itemgetter(1)) # type: ignore[arg-type]

# CRAWLING

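
Taken together, the two batching helpers touched above can be exercised as in the following sketch; it assumes the post-commit signatures shown in the diff and uses made-up URLs:

from courlan import UrlStore

store = UrlStore()
store.add_urls(["https://test.org/" + str(i) for i in range(5)])

# immediate batch: URLs whose domains have been idle longer than
# time_limit seconds, capped at max_urls
batch = store.get_download_urls(max_urls=2, time_limit=0)

# timed batch: (seconds_to_wait, url) tuples spread out per domain
schedule = store.establish_download_schedule(max_urls=3, time_limit=1)
for wait_secs, url in schedule:
    print(round(wait_secs, 2), url)
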
27 changes: 20 additions & 7 deletions tests/urlstore_tests.py
@@ -16,7 +16,6 @@
import pytest

from courlan import UrlStore
from courlan.core import filter_links
from courlan.urlstore import State, load_store


@@ -287,18 +286,32 @@ def test_urlstore():
assert my_urls.total_url_number() == 20014

# get download URLs
downloadable_urls = my_urls.get_download_urls(timelimit=0)
downloadable_urls = my_urls.get_download_urls(time_limit=0, max_urls=1)
assert (
len(downloadable_urls) == 2
len(downloadable_urls) == 1
and downloadable_urls[0] == "https://www.example.org/1"
)
assert (
datetime.now() - my_urls.urldict["https://www.example.org"].timestamp
).total_seconds() < 0.25
assert my_urls.urldict["https://www.example.org"].count == 3
assert my_urls.urldict["https://test.org"].count == 1
downloadable_urls = my_urls.get_download_urls() # limit=10

# does not work on Windows?
# if os.name != "nt":
test_urls = UrlStore()
test_urls.add_urls(
["https://www.example.org/1", "https://test.org/1", "https://test.org/2"]
)
downloadable_urls = test_urls.get_download_urls(time_limit=0)
assert (
len(downloadable_urls) == 2
and downloadable_urls[0].startswith("https://www.example.org")
and downloadable_urls[1].startswith("https://test.org")
and test_urls.urldict["https://test.org"].count == 1
)
downloadable_urls = test_urls.get_download_urls()
assert len(downloadable_urls) == 0

other_store = UrlStore()
downloadable_urls = other_store.get_download_urls()
assert not downloadable_urls and other_store.done is True
Expand All @@ -320,14 +333,14 @@ def test_urlstore():
assert (
len(schedule) == 1
and round(schedule[0][0]) == 1
and schedule[0][1] == "https://www.example.org/2"
and schedule[0][1].startswith("https://www.example.org")
)
schedule = my_urls.establish_download_schedule(max_urls=6, time_limit=1)
assert len(schedule) == 6 and round(max(s[0] for s in schedule)) == 4
assert my_urls.urldict["https://www.example.org"].count == 7
assert (
my_urls.urldict["https://test.org"].count
== 4
== 3
== sum(u.visited is True for u in my_urls.urldict["https://test.org"].tuples)
)
assert my_urls.download_threshold_reached(8) is False
