diff --git a/src/config/config.yaml b/src/config/config.yaml index 67a0181..56dc19e 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -7,7 +7,7 @@ source_service: end_point: https://api.bing.microsoft.com subscription_key: result_count: 3 - text_extract: beautifulsoup # beautifulsoup / trafilatura + text_extract: trafilatura # beautifulsoup / trafilatura llm_service: provider: openai # openai/goose_ai openai_api: diff --git a/src/text_extract/html/trafilatura.py b/src/text_extract/html/trafilatura.py index 1dbf6f3..8e0ab5e 100644 --- a/src/text_extract/html/trafilatura.py +++ b/src/text_extract/html/trafilatura.py @@ -1,4 +1,5 @@ from trafilatura import bare_extraction +from trafilatura.meta import reset_caches from text_extract.html.abc_html_extract import AbstractHtmlExtractSvc @@ -9,6 +10,7 @@ def __init__(self): def extract_from_html(self, html_str: str): extract = bare_extraction(html_str, favor_precision=True) + reset_caches() try: return extract['text'].split("\n") except: