Skip to content

Commit

Permalink
fix(parsing): catch url errors resulting from parsed image l…
Browse files Browse the repository at this point in the history
…inks
  • Loading branch information
AndyTheFactory committed Nov 1, 2023
1 parent 41152eb commit 9140a04
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 14 deletions.
29 changes: 15 additions & 14 deletions newspaper/extractors/content_extractor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
import copy
import logging
import re
from collections import defaultdict
from datetime import datetime
import json

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urlparse, urlunparse

from newspaper import urls
from newspaper.urls import urljoin_if_valid
from newspaper.extractors.defines import (
MOTLEY_REPLACEMENT,
TITLE_REPLACEMENTS,
Expand All @@ -15,17 +27,6 @@
url_stopwords,
)

import copy
import logging
import re
from collections import defaultdict
from datetime import datetime
import json

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urljoin, urlparse, urlunparse

log = logging.getLogger(__name__)


Expand Down Expand Up @@ -450,7 +451,7 @@ def get_meta_img_url(self, article_url, doc):
top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return urljoin_if_valid(article_url, top_meta_image)
return ""

def get_meta_type(self, doc):
Expand Down Expand Up @@ -559,7 +560,7 @@ def get_img_urls(self, article_url, doc):
img_kwargs = {"tag": "img"}
img_tags = self.parser.getElementsByTag(doc, **img_kwargs)
urls_ = [img_tag.get("src") for img_tag in img_tags if img_tag.get("src")]
img_links = set([urljoin(article_url, url) for url in urls_])
img_links = {urljoin_if_valid(article_url, url) for url in urls_}
return img_links

def get_first_img_url(self, article_url, top_node):
Expand All @@ -570,7 +571,7 @@ def get_first_img_url(self, article_url, top_node):
node_images = self.get_img_urls(article_url, top_node)
node_images = list(node_images)
if node_images:
return urljoin(article_url, node_images[0])
return urljoin_if_valid(article_url, node_images[0])
return ""

def _get_urls(self, doc, titles):
Expand Down
19 changes: 19 additions & 0 deletions newspaper/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,3 +356,22 @@ def is_abs_url(url):

c_regex = re.compile(regex)
return c_regex.search(url) is not None


def urljoin_if_valid(base_url: str, url: str) -> str:
    """Join a base url and a possibly relative url, guarding against
    invalid urls resulting from parsing.

    ``urllib.parse`` raises ``ValueError`` for malformed urls (e.g. an
    unclosed IPv6 bracket such as ``http://[::1``); scraped image links
    regularly contain such garbage, so we swallow that error here.

    Args:
        base_url (str): the base url (namely the article url)
        url (str): a relative or absolute url

    Returns:
        str: joined url if valid, otherwise empty string
    """
    try:
        # Keep the try body minimal: only urljoin can raise ValueError here.
        return urljoin(base_url, url)
    except ValueError:
        return ""

0 comments on commit 9140a04

Please sign in to comment.