Skip to content

Commit

Permalink
fix(parsing): catch url errors resulting from parsed image l…
Browse files Browse the repository at this point in the history
…inks
  • Loading branch information
AndyTheFactory committed Nov 1, 2023
1 parent 41152eb commit 9140a04
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 14 deletions.
29 changes: 15 additions & 14 deletions newspaper/extractors/content_extractor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
import copy
import logging
import re
from collections import defaultdict
from datetime import datetime
import json

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urlparse, urlunparse

from newspaper import urls
from newspaper.urls import urljoin_if_valid
from newspaper.extractors.defines import (
MOTLEY_REPLACEMENT,
TITLE_REPLACEMENTS,
Expand All @@ -15,17 +27,6 @@
url_stopwords,
)

import copy
import logging
import re
from collections import defaultdict
from datetime import datetime
import json

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urljoin, urlparse, urlunparse

log = logging.getLogger(__name__)


Expand Down Expand Up @@ -450,7 +451,7 @@ def get_meta_img_url(self, article_url, doc):
top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return urljoin_if_valid(article_url, top_meta_image)
return ""

def get_meta_type(self, doc):
Expand Down Expand Up @@ -559,7 +560,7 @@ def get_img_urls(self, article_url, doc):
img_kwargs = {"tag": "img"}
img_tags = self.parser.getElementsByTag(doc, **img_kwargs)
urls_ = [img_tag.get("src") for img_tag in img_tags if img_tag.get("src")]
img_links = set([urljoin(article_url, url) for url in urls_])
img_links = {urljoin_if_valid(article_url, url) for url in urls_}
return img_links

def get_first_img_url(self, article_url, top_node):
Expand All @@ -570,7 +571,7 @@ def get_first_img_url(self, article_url, top_node):
node_images = self.get_img_urls(article_url, top_node)
node_images = list(node_images)
if node_images:
return urljoin(article_url, node_images[0])
return urljoin_if_valid(article_url, node_images[0])
return ""

def _get_urls(self, doc, titles):
Expand Down
19 changes: 19 additions & 0 deletions newspaper/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,3 +356,22 @@ def is_abs_url(url):

c_regex = re.compile(regex)
return c_regex.search(url) is not None


def urljoin_if_valid(base_url: str, url: str) -> str:
    """Join a base url and a possibly relative url, guarding against
    invalid urls resulting from parsing.

    ``urllib.parse`` raises ``ValueError`` for malformed urls (e.g. an
    unclosed IPv6 bracket such as ``http://[::1``); scraped image links
    regularly contain such garbage, so we swallow that error here.

    Args:
        base_url (str): the base url (namely the article url)
        url (str): a relative or absolute url

    Returns:
        str: joined url if valid, otherwise empty string
    """
    try:
        # Keep the try body minimal: only urljoin can raise ValueError here.
        return urljoin(base_url, url)
    except ValueError:
        return ""

0 comments on commit 9140a04

Please sign in to comment.