From 53b8de9a462a6ee9240982d747a591cb8157b412 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 31 May 2023 16:27:48 +0200 Subject: [PATCH] UrlStore: add method get_unvisited_domains() (#40) * UrlStore: add method get_unvisited_domains() * test fix --- courlan/urlstore.py | 8 +++++++- tests/urlstore_tests.py | 15 ++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/courlan/urlstore.py b/courlan/urlstore.py index 03d194e4..ea7a981e 100644 --- a/courlan/urlstore.py +++ b/courlan/urlstore.py @@ -235,6 +235,8 @@ def filter_unknown_urls(self, urls: List[str]) -> List[str]: "Take a list of URLs and return the currently unknown ones." return self._search_urls(urls, switch=1) + # DOMAINS / HOSTNAMES + def get_known_domains(self) -> List[str]: "Return all known domains as a list." return list(self.urldict) @@ -245,6 +247,10 @@ def is_exhausted_domain(self, domain: str) -> bool: return self.urldict[domain].all_visited raise KeyError("website not in store") + def get_unvisited_domains(self) -> List[str]: + "Return all domains which have not been all visited." + return [d for d in self.urldict if not self.urldict[d].all_visited] + # URL-BASED QUERIES def has_been_visited(self, url: str) -> bool: @@ -264,7 +270,7 @@ def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]: def unvisited_websites_number(self) -> int: "Return the number of websites for which there are still URLs to visit." - return len([d for d in self.urldict if not self.urldict[d].all_visited]) + return len(self.get_unvisited_domains()) # DOWNLOADS diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py index d6e68d44..0d00a719 100644 --- a/tests/urlstore_tests.py +++ b/tests/urlstore_tests.py @@ -10,6 +10,7 @@ import uuid from datetime import datetime +from time import sleep import pytest @@ -168,6 +169,7 @@ def test_urlstore(): url1 = my_urls.get_url(example_domain) timestamp = my_urls.urldict[example_domain].timestamp + sleep(0.1) url2 = my_urls.get_url(example_domain) assert url1 != url2 and url1 == "https://www.example.org/1/10/" assert my_urls.urldict[example_domain].count == 2 @@ -178,7 +180,7 @@ def test_urlstore(): # as_visited=False timestamp = my_urls.urldict[example_domain].timestamp url3 = my_urls.get_url(example_domain, as_visited=False) - assert url3 != url1 and url3 != url2 + assert url3 not in (url1, url2) assert my_urls.urldict[example_domain].count == 2 assert timestamp == my_urls.urldict[example_domain].timestamp assert url3 in set(my_urls.find_unvisited_urls(example_domain)) @@ -224,7 +226,7 @@ def test_urlstore(): ) assert my_urls.has_been_visited("http://tovisit.com/page") is True assert my_urls.urldict["http://tovisit.com"].all_visited is True - assert my_urls.filter_unvisited_urls(["http://tovisit.com/page"]) == [] + assert not my_urls.filter_unvisited_urls(["http://tovisit.com/page"]) assert my_urls.filter_unvisited_urls(["http://tovisit.com/otherpage"]) == [ "http://tovisit.com/otherpage" ] @@ -243,7 +245,10 @@ def test_urlstore(): == 10011 ) assert len(my_urls.find_unvisited_urls(example_domain)) == 10009 - assert my_urls.unvisited_websites_number() == 4 + assert ( + my_urls.unvisited_websites_number() == len(my_urls.get_unvisited_domains()) == 4 + ) + assert my_urls.total_url_number() == 20013 # get download URLs downloadable_urls = my_urls.get_download_urls(timelimit=0) @@ -260,11 +265,11 @@ def test_urlstore(): assert len(downloadable_urls) == 0 other_store = UrlStore() downloadable_urls = other_store.get_download_urls() - assert downloadable_urls == [] and other_store.done is True + assert not downloadable_urls and other_store.done is True # schedule schedule = other_store.establish_download_schedule() - assert schedule == [] + assert not schedule # store exhaustion other_store = UrlStore() other_store.add_urls(