Merge pull request #68 from michaelthwan/trafilatura_mem_leak_fix

Reset cache after extracting html
michaelthwan · Mar 12, 2023 · 0472325
2 parents f810f77 + b779a7a
commit 0472325
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 1 deletion.
diff --git a/src/config/config.yaml b/src/config/config.yaml
@@ -7,7 +7,7 @@ source_service:
     end_point: https://api.bing.microsoft.com
     subscription_key:
     result_count: 3
-    text_extract: beautifulsoup # beautifulsoup / trafilatura
+    text_extract: trafilatura # beautifulsoup / trafilatura
 llm_service:
   provider: openai # openai/goose_ai
   openai_api:

diff --git a/src/text_extract/html/trafilatura.py b/src/text_extract/html/trafilatura.py
@@ -1,4 +1,5 @@
 from trafilatura import bare_extraction
+from trafilatura.meta import reset_caches
 
 from text_extract.html.abc_html_extract import AbstractHtmlExtractSvc
 
@@ -9,6 +10,7 @@ def __init__(self):
 
     def extract_from_html(self, html_str: str):
         extract = bare_extraction(html_str, favor_precision=True)
+        reset_caches()
         try:
             return extract['text'].split("\n")
         except: