shorter code: booleans
adbar committed Jul 27, 2022
1 parent 0618f0b commit 78095fb
Showing 6 changed files with 27 additions and 32 deletions.
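
Every change in this commit applies the same pattern: explicit comparisons against the boolean singletons (x is True, x is False) are replaced by plain truthiness checks (x, not x). A minimal sketch of the equivalence, not taken from the repository and assuming the tested values are genuine booleans, as the flags below are:

    # for real bools the two styles agree
    strict = True
    assert (strict is True) == bool(strict)
    assert (strict is False) == (not strict)

    # caveat: for non-bool values identity and truthiness differ,
    # so the rewrite is only safe for bool-typed flags
    value = 1
    assert value is not True   # identity with the singleton fails
    assert bool(value)         # truthiness succeeds

The rewrite also flattens a nested if in type_filter and unwraps two list comprehensions in urlstore.py, which accounts for the net five-line reduction.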
4 changes: 2 additions & 2 deletions courlan/clean.py
@@ -107,7 +107,7 @@ def clean_query(
            teststr = qelem.lower()
            # control param
            if (
-               strict is True
+               strict
                and teststr not in ALLOWED_PARAMS
                and teststr not in CONTROL_PARAMS
            ):
@@ -147,7 +147,7 @@ def normalize_url(
    # Leading /../'s in the path are removed
    newpath = PATH2.sub("", newpath)
    # fragment
-   newfragment = "" if strict is True else parsed_url.fragment
+   newfragment = "" if strict else parsed_url.fragment
    # lowercase + remove fragments
    parsed_url = parsed_url._replace(
        scheme=parsed_url.scheme.lower(),
2 changes: 1 addition & 1 deletion courlan/cli.py
@@ -70,7 +70,7 @@ def parse_args(args: Any) -> Any:

def process_args(args: Any) -> None:
    """Start processing according to the arguments"""
-   if args.sample is False:
+   if not args.sample:
        with open(
            args.inputfile, "r", encoding="utf-8", errors="ignore"
        ) as inputfh, open(args.outputfile, "w", encoding="utf-8") as outputfh:
10 changes: 5 additions & 5 deletions courlan/core.py
@@ -69,7 +69,7 @@ def check_url(
        url = scrub_url(url)

        # get potential redirect, can raise ValueError
-       if with_redirects is True:
+       if with_redirects:
            url = redirection_test(url)

        # spam & structural elements
@@ -94,15 +94,15 @@ def check_url(
            raise ValueError

        # strict content filtering
-       if strict is True and path_filter(parsed_url.path, parsed_url.query) is False:
+       if strict and path_filter(parsed_url.path, parsed_url.query) is False:
            LOGGER.debug("rejected, path filter: %s", url)
            raise ValueError

        # normalize
        url = normalize_url(parsed_url, strict, language)

        # domain info: use blacklist in strict mode only
-       if strict is True:
+       if strict:
            domain = extract_domain(url, blacklist=BLACKLIST, fast=True)
        else:
            domain = extract_domain(url, fast=True)
@@ -128,7 +128,7 @@ def sample_urls(
) -> List[str]:
    """Sample a list of URLs by domain name, optionally using constraints on their number"""
    # logging
-   if verbose is True:
+   if verbose:
        LOGGER.setLevel(logging.DEBUG)
    else:
        LOGGER.setLevel(logging.ERROR)
@@ -244,7 +244,7 @@ def extract_links(
        # external/internal links
        if external_bool != is_external(link, reference):
            continue
-       if is_known_link(link, validlinks) is True:
+       if is_known_link(link, validlinks):
            continue
        validlinks.add(link)
    # return
9 changes: 4 additions & 5 deletions courlan/filters.py
@@ -150,7 +150,7 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
            score = langcodes_score(language, occurrence, score)
        # don't perform the test if there are too many candidates: > 2
    # second test: prepended language cues
-   if strict is True and language in LANGUAGE_MAPPINGS:
+   if strict and language in LANGUAGE_MAPPINGS:
        match = HOST_LANG_FILTER.match(url)
        if match:
            candidate = match[1].lower()
@@ -182,16 +182,15 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
            raise ValueError
        # wordpress website structure
        if WORDPRESS_CONTENT.search(url) and (
-           with_nav is not True or not is_navigation_page(url)
+           not with_nav or not is_navigation_page(url)
        ):
            raise ValueError
        # not suitable: ads, adult and embedded content
        if UNDESIRABLE.search(url):
            raise ValueError
        # type hidden in parameters + video content
-       if strict is True:
-           if FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url):
-               raise ValueError
+       if strict and (FILE_TYPE.search(url) or ADULT_AND_VIDEOS.search(url)):
+           raise ValueError
    except ValueError:
        return False
    # default
30 changes: 13 additions & 17 deletions courlan/urlstore.py
@@ -71,7 +71,7 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
                    [
                        domain + u.urlpath
                        for u in self._load_urls(domain)
-                       if u.visited is False
+                       if not u.visited
                    ]
                ),
                file=sys.stderr,
@@ -84,8 +84,8 @@ def dump_unvisited_urls(num: Any, frame: Any) -> None:
        signal.signal(signal.SIGTERM, dump_unvisited_urls)

    # def _filter_urlpaths(self, domain, urls):
-   #     if self.validation is True or self.language is not None:
-   #         return [u for u in urls if self._filter_url(domain + u) is True]
+   #     if self.validation or self.language is not None:
+   #         return [u for u in urls if self._filter_url(domain + u)]
    #     return urls

    def _buffer_urls(
@@ -140,14 +140,14 @@ def _store_urls(
        # use lock
        with self._lock:
            # compression
-           if self.compressed is True:
+           if self.compressed:
                self.urldict[domain].tuples = bz2.compress(  # type: ignore
                    pickle.dumps(urls, protocol=4)
                )
            else:
                self.urldict[domain].tuples = urls
            # adjust all_visited status
-           self.urldict[domain].all_visited = all(u.visited is True for u in urls)
+           self.urldict[domain].all_visited = all(u.visited for u in urls)
            # timestamp/backoff value
            if timestamp is not None:
                self.urldict[domain].timestamp = timestamp
@@ -173,7 +173,7 @@ def _search_urls(
            }
            # run checks: case 1: the path matches, case 2: visited URL
            if urlpath in known_paths and (
-               switch == 1 or (switch == 2 and known_paths[urlpath] is True)
+               switch == 1 or (switch == 2 and known_paths[urlpath])
            ):
                del remaining_urls[url]
        # preserve input order
@@ -239,26 +239,26 @@ def has_been_visited(self, url: str) -> bool:
    def find_unvisited_urls(self, domain: str) -> List[str]:
        "Get all unvisited URLs for the given domain."
        values = self._load_urls(domain)
-       return [domain + u.urlpath for u in values if u.visited is False]
+       return [domain + u.urlpath for u in values if not u.visited]

    def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
        "Take a list of URLs and return the currently unvisited ones."
        return self._search_urls(urls, switch=2)

    def unvisited_websites_number(self) -> int:
        "Return the number of websites for which there are still URLs to visit."
-       return len([d for d in self.urldict if self.urldict[d].all_visited is False])
+       return len([d for d in self.urldict if not self.urldict[d].all_visited])

    # DOWNLOADS

    def get_url(self, domain: str) -> Optional[str]:
        "Retrieve a single URL and consider it to be visited (with corresponding timestamp)."
        # not fully used
-       if self.urldict[domain].all_visited is False:
+       if not self.urldict[domain].all_visited:
            url_tuples = self._load_urls(domain)
            # get first non-seen url
            for url in url_tuples:
-               if url.visited is False:
+               if not url.visited:
                    url.visited = True
                    with self._lock:
                        self.urldict[domain].count += 1
@@ -273,9 +273,7 @@ def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
        """Get a list of immediately downloadable URLs according to the given
        time limit per domain."""
        with self._lock:
-           potential = [
-               d for d in self.urldict if self.urldict[d].all_visited is False
-           ]
+           potential = [d for d in self.urldict if not self.urldict[d].all_visited]
            if not potential:
                self.done = True
                return None
@@ -297,9 +295,7 @@ def establish_download_schedule(
        backoff schedule (in seconds)."""
        # see which domains are free
        with self._lock:
-           potential = [
-               d for d in self.urldict if self.urldict[d].all_visited is False
-           ]
+           potential = [d for d in self.urldict if not self.urldict[d].all_visited]
            if not potential:
                self.done = True
                return []
@@ -318,7 +314,7 @@ def establish_download_schedule(
                    or (len(targets) + len(urlpaths)) >= max_urls
                ):
                    break
-               if url.visited is False:
+               if not url.visited:
                    urlpaths.append(url.urlpath)
                    url.visited = True
                    with self._lock:
4 changes: 2 additions & 2 deletions courlan/urlutils.py
@@ -21,7 +21,7 @@ def get_tldinfo(
    url: str, fast: bool = False
) -> Union[Tuple[None, None], Tuple[str, str]]:
    """Cached function to extract top-level domain info"""
-   if fast is True:
+   if fast:
        # try with regexes
        domain_match = DOMAIN_REGEX.match(url)
        if domain_match:
@@ -135,7 +135,7 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
    stripped_ref, ref = get_tldinfo(reference, fast=True)
    stripped_domain, domain = get_tldinfo(url, fast=True)
    # comparison
-   if ignore_suffix is True:
+   if ignore_suffix:
        return stripped_domain != stripped_ref
    return domain != ref
