shorter code: booleans
adbar committed Jul 27, 2022
1 parent 0618f0b commit 78095fb
Showing 6 changed files with 27 additions and 32 deletions.
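
Every change in this commit applies the same pattern: explicit comparisons against the boolean singletons (x is True, x is False) are replaced by plain truthiness checks (x, not x). A minimal sketch of the equivalence, not taken from the repository and assuming the tested values are genuine booleans, as the flags below are:

    # for real bools the two styles agree
    strict = True
    assert (strict is True) == bool(strict)
    assert (strict is False) == (not strict)

    # caveat: for non-bool values identity and truthiness differ,
    # so the rewrite is only safe for bool-typed flags
    value = 1
    assert value is not True   # identity with the singleton fails
    assert bool(value)         # truthiness succeeds

The rewrite also flattens a nested if in type_filter and unwraps two list comprehensions in urlstore.py, which accounts for the net five-line reduction.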
4 changes: 2 additions & 2 deletions courlan/clean.py
@@ -107,7 +107,7 @@ def clean_query(
            teststr = qelem.lower()
            # control param
            if (
-               strict is True
+               strict
                and teststr not in ALLOWED_PARAMS
                and teststr not in CONTROL_PARAMS
            ):
@@ -147,7 +147,7 @@ def normalize_url(
    # Leading /../'s in the path are removed
    newpath = PATH2.sub("", newpath)
    # fragment
-   newfragment = "" if strict is True else parsed_url.fragment
+   newfragment = "" if strict else parsed_url.fragment
    # lowercase + remove fragments
    parsed_url = parsed_url._replace(
        scheme=parsed_url.scheme.lower(),
2 changes: 1 addition & 1 deletion courlan/cli.py
@@ -70,7 +70,7 @@ def parse_args(args: Any) -> Any:

def process_args(args: Any) -> None:
    """Start processing according to the arguments"""
-   if args.sample is False:
+   if not args.sample:
        with open(
            args.inputfile, "r", encoding="utf-8", errors="ignore"
        ) as inputfh, open(args.outputfile, "w", encoding="utf-8") as outputfh:
10 changes: 5 additions & 5 deletions courlan/core.py
@@ -69,7 +69,7 @@ def check_url(
        url = scrub_url(url)

        # get potential redirect, can raise ValueError
-       if with_redirects is True:
+       if with_redirects:
            url = redirection_test(url)

        # spam & structural elements
@@ -94,15 +94,15 @@ def check_url(
            raise ValueError

        # strict content filtering
-       if strict is True and path_filter(parsed_url.path, parsed_url.query) is False:
+       if strict and path_filter(parsed_url.path, parsed_url.query) is False:
            LOGGER.debug("rejected, path filter: %s", url)
            raise ValueError

        # normalize
        url = normalize_url(parsed_url, strict, language)

        # domain info: use blacklist in strict mode only
-       if strict is True:
+       if strict:
            domain = extract_domain(url, blacklist=BLACKLIST, fast=True)
        else:
            domain = extract_domain(url, fast=True)
@@ -128,7 +128,7 @@ def sample_urls(
) -> List[str]:
    """Sample a list of URLs by domain name, optionally using constraints on their number"""
    # logging
-   if verbose is True:
+   if verbose:
        LOGGER.setLevel(logging.DEBUG)
    else:
        LOGGER.setLevel(logging.ERROR)
@@ -244,7 +244,7 @@ def extract_links(
        # external/internal links
        if external_bool != is_external(link, reference):
            continue
-       if is_known_link(link, validlinks) is True:
+       if is_known_link(link, validlinks):
            continue
        validlinks.add(link)
    # return
9 changes: 4 additions & 5 deletions courlan/filters.py
@@ -150,7 +150,7 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
            score = langcodes_score(language, occurrence, score)
        # don't perform the test if there are too many candidates: > 2
    # second test: prepended language cues
-   if strict is True and language in LANGUAGE_MAPPINGS:
+   if strict and language in LANGUAGE_MAPPINGS:
        match = HOST_LANG_FILTER.match(url)
        if match:
            candidate = match[1].lower()
@@ -182,16 +182,15 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
            raise ValueError
        # wordpress website structure
        if WORDPRESS_CONTENT.search(url) and (
-           with_nav is not True or not is_navigation_page(url)
+           not with_nav or not is_navigation_page(url)
        ):
            raise ValueError
        # not suitable: ads, adult and embedded content
        if UNDESIRABLE.search(url):
            raise ValueError
        # type hidden in parameters + video content
-       if strict is True:
-           if FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url):
-               raise ValueError
+       if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)):
+           raise ValueError
    except ValueError:
        return False
    # default
30 changes: 13 additions & 17 deletions courlan/urlstore.py
@@ -71,7 +71,7 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
                    [
                        domain + u.urlpath
                        for u in self._load_urls(domain)
-                       if u.visited is False
+                       if not u.visited
                    ]
                ),
                file=sys.stderr,
@@ -84,8 +84,8 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
        signal.signal(signal.SIGTERM, dump_unvisited_urls)

    # def _filter_urlpaths(self, domain, urls):
-   #     if self.validation is True or self.language is not None:
-   #         return [u for u in urls if self._filter_url(domain + u) is True]
+   #     if self.validation or self.language is not None:
+   #         return [u for u in urls if self._filter_url(domain + u)]
    #     return urls

    def _buffer_urls(
@@ -140,14 +140,14 @@ def _store_urls(
        # use lock
        with self._lock:
            # compression
-           if self.compressed is True:
+           if self.compressed:
                self.urldict[domain].tuples = bz2.compress(  # type: ignore
                    pickle.dumps(urls, protocol=4)
                )
            else:
                self.urldict[domain].tuples = urls
            # adjust all_visited status
-           self.urldict[domain].all_visited = all(u.visited is True for u in urls)
+           self.urldict[domain].all_visited = all(u.visited for u in urls)
            # timestamp/backoff value
            if timestamp is not None:
                self.urldict[domain].timestamp = timestamp
@@ -173,7 +173,7 @@ def _search_urls(
            }
            # run checks: case 1: the path matches, case 2: visited URL
            if urlpath in known_paths and (
-               switch == 1 or (switch == 2 and known_paths[urlpath] is True)
+               switch == 1 or (switch == 2 and known_paths[urlpath])
            ):
                del remaining_urls[url]
        # preserve input order
@@ -239,26 +239,26 @@ def has_been_visited(self, url: str) -> bool:
    def find_unvisited_urls(self, domain: str) -> List[str]:
        "Get all unvisited URLs for the given domain."
        values = self._load_urls(domain)
-       return [domain + u.urlpath for u in values if u.visited is False]
+       return [domain + u.urlpath for u in values if not u.visited]

    def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
        "Take a list of URLs and return the currently unvisited ones."
        return self._search_urls(urls, switch=2)

    def unvisited_websites_number(self) -> int:
        "Return the number of websites for which there are still URLs to visit."
-       return len([d for d in self.urldict if self.urldict[d].all_visited is False])
+       return len([d for d in self.urldict if not self.urldict[d].all_visited])

    # DOWNLOADS

    def get_url(self, domain: str) -> Optional[str]:
        "Retrieve a single URL and consider it to be visited (with corresponding timestamp)."
        # not fully used
-       if self.urldict[domain].all_visited is False:
+       if not self.urldict[domain].all_visited:
            url_tuples = self._load_urls(domain)
            # get first non-seen url
            for url in url_tuples:
-               if url.visited is False:
+               if not url.visited:
                    url.visited = True
                    with self._lock:
                        self.urldict[domain].count += 1
@@ -273,9 +273,7 @@ def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
        """Get a list of immediately downloadable URLs according to the given
        time limit per domain."""
        with self._lock:
-           potential = [
-               d for d in self.urldict if self.urldict[d].all_visited is False
-           ]
+           potential = [d for d in self.urldict if not self.urldict[d].all_visited]
            if not potential:
                self.done = True
                return None
@@ -297,9 +295,7 @@ def establish_download_schedule(
        backoff schedule (in seconds)."""
        # see which domains are free
        with self._lock:
-           potential = [
-               d for d in self.urldict if self.urldict[d].all_visited is False
-           ]
+           potential = [d for d in self.urldict if not self.urldict[d].all_visited]
            if not potential:
                self.done = True
                return []
@@ -318,7 +314,7 @@ def establish_download_schedule(
                    or (len(targets) + len(urlpaths)) >= max_urls
                ):
                    break
-               if url.visited is False:
+               if not url.visited:
                    urlpaths.append(url.urlpath)
                    url.visited = True
                    with self._lock:
4 changes: 2 additions & 2 deletions courlan/urlutils.py
@@ -21,7 +21,7 @@ def get_tldinfo(
    url: str, fast: bool = False
) -> Union[Tuple[None, None], Tuple[str, str]]:
    """Cached function to extract top-level domain info"""
-   if fast is True:
+   if fast:
        # try with regexes
        domain_match = DOMAIN_REGEX.match(url)
        if domain_match:
@@ -135,7 +135,7 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
    stripped_ref, ref = get_tldinfo(reference, fast=True)
    stripped_domain, domain = get_tldinfo(url, fast=True)
    # comparison
-   if ignore_suffix is True:
+   if ignore_suffix:
        return stripped_domain != stripped_ref
    return domain != ref
