UrlStore: add method get_unvisited_domains() (#40)
* UrlStore: add method get_unvisited_domains()

* test fix
adbar committed May 31, 2023
1 parent 6cb411a commit 53b8de9
Showing 2 changed files with 17 additions and 6 deletions.
courlan/urlstore.py (8 changes: 7 additions & 1 deletion)
@@ -235,6 +235,8 @@ def filter_unknown_urls(self, urls: List[str]) -> List[str]:
         "Take a list of URLs and return the currently unknown ones."
         return self._search_urls(urls, switch=1)
 
+    # DOMAINS / HOSTNAMES
+
     def get_known_domains(self) -> List[str]:
         "Return all known domains as a list."
         return list(self.urldict)
@@ -245,6 +247,10 @@ def is_exhausted_domain(self, domain: str) -> bool:
             return self.urldict[domain].all_visited
         raise KeyError("website not in store")
 
+    def get_unvisited_domains(self) -> List[str]:
+        "Return all domains which have not been fully visited."
+        return [d for d in self.urldict if not self.urldict[d].all_visited]
+
     # URL-BASED QUERIES
 
     def has_been_visited(self, url: str) -> bool:
@@ -264,7 +270,7 @@ def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
 
     def unvisited_websites_number(self) -> int:
         "Return the number of websites for which there are still URLs to visit."
-        return len([d for d in self.urldict if not self.urldict[d].all_visited])
+        return len(self.get_unvisited_domains())
 
     # DOWNLOADS
 
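Taken together, the change promotes the domain-level query to a public method and lets unvisited_websites_number() delegate to it. A minimal usage sketch (illustrative store contents and outputs; add_urls(..., visited=True) is assumed here to mark the given URLs as already visited):

from courlan import UrlStore

store = UrlStore()
# one domain with URLs left to visit, one already exhausted
store.add_urls(["https://www.example.org/1", "https://www.example.org/2"])
store.add_urls(["https://test.org/1"], visited=True)

# new in this commit: list the domains that still have unvisited URLs
print(store.get_unvisited_domains())      # ['https://www.example.org']
# the existing counter is now a thin wrapper around the same query
print(store.unvisited_websites_number())  # 1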
tests/urlstore_tests.py (15 changes: 10 additions & 5 deletions)
@@ -10,6 +10,7 @@
 import uuid
 
 from datetime import datetime
+from time import sleep
 
 import pytest
 
@@ -168,6 +169,7 @@ def test_urlstore():
 
     url1 = my_urls.get_url(example_domain)
     timestamp = my_urls.urldict[example_domain].timestamp
+    sleep(0.1)
     url2 = my_urls.get_url(example_domain)
     assert url1 != url2 and url1 == "https://www.example.org/1/10/"
     assert my_urls.urldict[example_domain].count == 2
@@ -178,7 +180,7 @@ def test_urlstore():
     # as_visited=False
     timestamp = my_urls.urldict[example_domain].timestamp
     url3 = my_urls.get_url(example_domain, as_visited=False)
-    assert url3 != url1 and url3 != url2
+    assert url3 not in (url1, url2)
     assert my_urls.urldict[example_domain].count == 2
     assert timestamp == my_urls.urldict[example_domain].timestamp
     assert url3 in set(my_urls.find_unvisited_urls(example_domain))
@@ -224,7 +226,7 @@ def test_urlstore():
     )
     assert my_urls.has_been_visited("http://tovisit.com/page") is True
     assert my_urls.urldict["http://tovisit.com"].all_visited is True
-    assert my_urls.filter_unvisited_urls(["http://tovisit.com/page"]) == []
+    assert not my_urls.filter_unvisited_urls(["http://tovisit.com/page"])
     assert my_urls.filter_unvisited_urls(["http://tovisit.com/otherpage"]) == [
         "http://tovisit.com/otherpage"
     ]
@@ -243,7 +245,10 @@ def test_urlstore():
         == 10011
     )
     assert len(my_urls.find_unvisited_urls(example_domain)) == 10009
-    assert my_urls.unvisited_websites_number() == 4
+    assert (
+        my_urls.unvisited_websites_number() == len(my_urls.get_unvisited_domains()) == 4
+    )
+    assert my_urls.total_url_number() == 20013
 
     # get download URLs
     downloadable_urls = my_urls.get_download_urls(timelimit=0)
@@ -260,11 +265,11 @@ def test_urlstore():
     assert len(downloadable_urls) == 0
     other_store = UrlStore()
     downloadable_urls = other_store.get_download_urls()
-    assert downloadable_urls == [] and other_store.done is True
+    assert not downloadable_urls and other_store.done is True
 
     # schedule
     schedule = other_store.establish_download_schedule()
-    assert schedule == []
+    assert not schedule
     # store exhaustion
     other_store = UrlStore()
     other_store.add_urls(
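A note on the "test fix" part of the commit: the timestamp assertions compare the per-domain timestamp before and after get_url(), which appears to refresh it on each visited fetch. The added sleep(0.1) guarantees a measurable gap between the two readings, so the comparison cannot fail on coarse-grained clocks. The pattern in isolation (a standalone sketch, not repository code):

from datetime import datetime
from time import sleep

before = datetime.now()
sleep(0.1)  # enforce a measurable gap between the two readings
after = datetime.now()
# without the sleep, both calls can land on the same clock tick
assert after > before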
