Skip to content

Commit

Permalink
urlstore: simplify code of convenience functions (#90)
Browse files Browse the repository at this point in the history
* urlstore: simplify code

* simplify unvisited
  • Loading branch information
adbar committed Apr 24, 2024
1 parent accbb1b commit a69aa76
Showing 1 changed file with 8 additions and 9 deletions.
17 changes: 8 additions & 9 deletions courlan/urlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ class DomainEntry:
"Class to record host-related information and URL paths."
__slots__ = ("count", "rules", "state", "timestamp", "total", "tuples")

def __init__(self, state: State = State.OPEN) -> None:
    "Initialize a domain entry; the crawl state defaults to OPEN unless overridden."
    self.count: int = 0  # number of URLs downloaded so far for this host
    self.rules: Optional[RobotFileParser] = None  # robots.txt rules, if any were fetched
    self.state: State = state  # crawl state of the whole domain
    self.timestamp: Optional[Any] = None  # presumably time of the last download — TODO confirm with callers
    self.total: int = 0  # total number of known URLs for this host
    self.tuples: Deque[UrlPathTuple] = deque()  # stored URL paths for this host
def discard(self, domains: List[str]) -> None:
    "Declare domains void and prune the store."
    with self._lock:  # guard shared store state against concurrent access
        for d in domains:
            # overwrite the entry wholesale: a fresh entry marked BUSTED
            # drops all previously stored URL tuples for the host
            self.urldict[d] = DomainEntry(state=State.BUSTED)
        self._set_done()
    # reclaim the discarded entries right away and log how much was freed
    num = gc.collect()
    LOGGER.debug("%s objects in GC after UrlStore.discard", num)
def get_unvisited_domains(self) -> List[str]:
    """Find all domains for which there are unvisited URLs
    and potentially adjust done meta-information."""
    unvisited = []
    if not self.done:
        # a domain is unvisited as long as its crawl state is still OPEN
        unvisited = [d for d, v in self.urldict.items() if v.state == State.OPEN]
        if not unvisited:
            # nothing left to visit anywhere: flag the whole store as done
            self._set_done()
    return unvisited

def is_exhausted_domain(self, domain: str) -> bool:
    "Tell if all known URLs for the website have been visited."
    if domain in self.urldict:
        # any state other than OPEN (e.g. all visited or busted) counts as exhausted
        return self.urldict[domain].state != State.OPEN
    # unknown domains are reported as not exhausted rather than raising
    return False
    # raise KeyError("website not in store")

Expand Down Expand Up @@ -475,15 +474,15 @@ def get_crawl_delay(self, website: str, default: float = 5) -> float:

def get_all_counts(self) -> List[int]:
    "Return all download counts for the hosts in store."
    # iterate values directly instead of re-indexing by key
    return [v.count for v in self.urldict.values()]

def total_url_number(self) -> int:
    "Find number of all URLs in store."
    # sum the per-host totals across all domain entries
    return sum(v.total for v in self.urldict.values())

def download_threshold_reached(self, threshold: float) -> bool:
    "Find out if the download limit (in seconds) has been reached for one of the websites in store."
    # True as soon as any single host's download count meets the threshold
    return any(v.count >= threshold for v in self.urldict.values())

def dump_urls(self) -> List[str]:
"Return a list of all known URLs."
Expand Down

0 comments on commit a69aa76

Please sign in to comment.