diff --git a/courlan/clean.py b/courlan/clean.py
index 2e3c982..ef3d868 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -107,7 +107,7 @@ def clean_query(
         teststr = qelem.lower()
         # control param
         if (
-            strict is True
+            strict
             and teststr not in ALLOWED_PARAMS
             and teststr not in CONTROL_PARAMS
         ):
@@ -147,7 +147,7 @@ def normalize_url(
     # Leading /../'s in the path are removed
     newpath = PATH2.sub("", newpath)
     # fragment
-    newfragment = "" if strict is True else parsed_url.fragment
+    newfragment = "" if strict else parsed_url.fragment
     # lowercase + remove fragments
     parsed_url = parsed_url._replace(
         scheme=parsed_url.scheme.lower(),
diff --git a/courlan/cli.py b/courlan/cli.py
index 06c2d9a..8eed55a 100644
--- a/courlan/cli.py
+++ b/courlan/cli.py
@@ -70,7 +70,7 @@ def parse_args(args: Any) -> Any:
 
 def process_args(args: Any) -> None:
     """Start processing according to the arguments"""
-    if args.sample is False:
+    if not args.sample:
         with open(
             args.inputfile, "r", encoding="utf-8", errors="ignore"
         ) as inputfh, open(args.outputfile, "w", encoding="utf-8") as outputfh:
diff --git a/courlan/core.py b/courlan/core.py
index c9f338c..1981db9 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -69,7 +69,7 @@ def check_url(
         url = scrub_url(url)
 
         # get potential redirect, can raise ValueError
-        if with_redirects is True:
+        if with_redirects:
             url = redirection_test(url)
 
         # spam & structural elements
@@ -94,7 +94,7 @@ def check_url(
             raise ValueError
 
         # strict content filtering
-        if strict is True and path_filter(parsed_url.path, parsed_url.query) is False:
+        if strict and path_filter(parsed_url.path, parsed_url.query) is False:
             LOGGER.debug("rejected, path filter: %s", url)
             raise ValueError
 
@@ -102,7 +102,7 @@
         url = normalize_url(parsed_url, strict, language)
 
         # domain info: use blacklist in strict mode only
-        if strict is True:
+        if strict:
             domain = extract_domain(url, blacklist=BLACKLIST, fast=True)
         else:
             domain = extract_domain(url, fast=True)
@@ -128,7 +128,7 @@ def sample_urls(
 ) -> List[str]:
     """Sample a list of URLs by domain name, optionally using constraints on their number"""
     # logging
-    if verbose is True:
+    if verbose:
         LOGGER.setLevel(logging.DEBUG)
     else:
         LOGGER.setLevel(logging.ERROR)
@@ -244,7 +244,7 @@ def extract_links(
         # external/internal links
         if external_bool != is_external(link, reference):
             continue
-        if is_known_link(link, validlinks) is True:
+        if is_known_link(link, validlinks):
             continue
         validlinks.add(link)
     # return
diff --git a/courlan/filters.py b/courlan/filters.py
index b3c31b1..759153d 100644
--- a/courlan/filters.py
+++ b/courlan/filters.py
@@ -150,7 +150,7 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
                 score = langcodes_score(language, occurrence, score)
         # don't perform the test if there are too many candidates: > 2
     # second test: prepended language cues
-    if strict is True and language in LANGUAGE_MAPPINGS:
+    if strict and language in LANGUAGE_MAPPINGS:
         match = HOST_LANG_FILTER.match(url)
         if match:
             candidate = match[1].lower()
@@ -182,16 +182,15 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
             raise ValueError
         # wordpress website structure
         if WORDPRESS_CONTENT.search(url) and (
-            with_nav is not True or not is_navigation_page(url)
+            not with_nav or not is_navigation_page(url)
         ):
             raise ValueError
         # not suitable: ads, adult and embedded content
         if UNDESIRABLE.search(url):
             raise ValueError
         # type hidden in parameters + video content
-        if strict is True:
-            if FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url):
-                raise ValueError
+        if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)):
+            raise ValueError
     except ValueError:
         return False
     # default
diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index 6680b4f..2c3b9b9 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -71,7 +71,7 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
                         [
                             domain + u.urlpath
                             for u in self._load_urls(domain)
-                            if u.visited is False
+                            if not u.visited
                         ]
                     ),
                     file=sys.stderr,
@@ -84,8 +84,8 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
         signal.signal(signal.SIGTERM, dump_unvisited_urls)
 
     # def _filter_urlpaths(self, domain, urls):
-    #     if self.validation is True or self.language is not None:
-    #         return [u for u in urls if self._filter_url(domain + u) is True]
+    #     if self.validation or self.language is not None:
+    #         return [u for u in urls if self._filter_url(domain + u)]
     #     return urls
 
     def _buffer_urls(
@@ -140,14 +140,14 @@ def _store_urls(
         # use lock
         with self._lock:
             # compression
-            if self.compressed is True:
+            if self.compressed:
                 self.urldict[domain].tuples = bz2.compress(  # type: ignore
                     pickle.dumps(urls, protocol=4)
                 )
             else:
                 self.urldict[domain].tuples = urls
             # adjust all_visited status
-            self.urldict[domain].all_visited = all(u.visited is True for u in urls)
+            self.urldict[domain].all_visited = all(u.visited for u in urls)
             # timestamp/backoff value
             if timestamp is not None:
                 self.urldict[domain].timestamp = timestamp
@@ -173,7 +173,7 @@ def _search_urls(
                 }
             # run checks: case 1: the path matches, case 2: visited URL
             if urlpath in known_paths and (
-                switch == 1 or (switch == 2 and known_paths[urlpath] is True)
+                switch == 1 or (switch == 2 and known_paths[urlpath])
             ):
                 del remaining_urls[url]
         # preserve input order
@@ -239,7 +239,7 @@ def has_been_visited(self, url: str) -> bool:
     def find_unvisited_urls(self, domain: str) -> List[str]:
         "Get all unvisited URLs for the given domain."
         values = self._load_urls(domain)
-        return [domain + u.urlpath for u in values if u.visited is False]
+        return [domain + u.urlpath for u in values if not u.visited]
 
     def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
         "Take a list of URLs and return the currently unvisited ones."
@@ -247,18 +247,18 @@ def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
 
     def unvisited_websites_number(self) -> int:
         "Return the number of websites for which there are still URLs to visit."
-        return len([d for d in self.urldict if self.urldict[d].all_visited is False])
+        return len([d for d in self.urldict if not self.urldict[d].all_visited])
 
 
     # DOWNLOADS
     def get_url(self, domain: str) -> Optional[str]:
         "Retrieve a single URL and consider it to be visited (with corresponding timestamp)."
         # not fully used
-        if self.urldict[domain].all_visited is False:
+        if not self.urldict[domain].all_visited:
             url_tuples = self._load_urls(domain)
             # get first non-seen url
             for url in url_tuples:
-                if url.visited is False:
+                if not url.visited:
                     url.visited = True
                     with self._lock:
                         self.urldict[domain].count += 1
@@ -273,9 +273,7 @@ def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
         """Get a list of immediately downloadable URLs according to the given
         time limit per domain."""
         with self._lock:
-            potential = [
-                d for d in self.urldict if self.urldict[d].all_visited is False
-            ]
+            potential = [d for d in self.urldict if not self.urldict[d].all_visited]
             if not potential:
                 self.done = True
                 return None
@@ -297,9 +295,7 @@ def establish_download_schedule(
         backoff schedule (in seconds)."""
         # see which domains are free
        with self._lock:
-            potential = [
-                d for d in self.urldict if self.urldict[d].all_visited is False
-            ]
+            potential = [d for d in self.urldict if not self.urldict[d].all_visited]
             if not potential:
                 self.done = True
                 return []
@@ -318,7 +314,7 @@ def establish_download_schedule(
                     or (len(targets) + len(urlpaths)) >= max_urls
                 ):
                     break
-                if url.visited is False:
+                if not url.visited:
                     urlpaths.append(url.urlpath)
                     url.visited = True
                     with self._lock:
diff --git a/courlan/urlutils.py b/courlan/urlutils.py
index 1dd3fa7..ea5a723 100644
--- a/courlan/urlutils.py
+++ b/courlan/urlutils.py
@@ -21,7 +21,7 @@ def get_tldinfo(
     url: str, fast: bool = False
 ) -> Union[Tuple[None, None], Tuple[str, str]]:
     """Cached function to extract top-level domain info"""
-    if fast is True:
+    if fast:
         # try with regexes
         domain_match = DOMAIN_REGEX.match(url)
         if domain_match:
@@ -135,7 +135,7 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
     stripped_ref, ref = get_tldinfo(reference, fast=True)
     stripped_domain, domain = get_tldinfo(url, fast=True)
     # comparison
-    if ignore_suffix is True:
+    if ignore_suffix:
         return stripped_domain != stripped_ref
     return domain != ref