From 754d6972dd40654091c96c7823c158858c3ad75c Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Wed, 31 Jul 2019 17:35:22 +0200 Subject: [PATCH 1/4] Move tree cleaning to extract function --- extruct/w3cmicrodata.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index cb8c9fa7..c855162f 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -67,7 +67,7 @@ def get_docid(self, node): def extract(self, htmlstring, base_url=None, encoding="UTF-8"): tree = parse_html(htmlstring, encoding=encoding) - return self.extract_items(tree, base_url) + return self.extract_items(cleaner.clean_html(tree), base_url) def extract_items(self, document, base_url): items_seen = set() @@ -203,8 +203,7 @@ def _extract_property_value(self, node, items_seen, base_url, force=False): return self._extract_textContent(node) def _extract_textContent(self, node): - clean_node = cleaner.clean_html(node) - return html_text.etree_to_text(clean_node) + return html_text.etree_to_text(node) MicrodataExtractor = LxmlMicrodataExtractor From f70f8a78346b22e5f3f7e5a1ccf0275fcd101380 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Wed, 31 Jul 2019 17:35:46 +0200 Subject: [PATCH 2/4] Do not clear link and meta tags --- extruct/w3cmicrodata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index c855162f..bc25ef6f 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -31,8 +31,8 @@ javascript=False, # onclick attributes are fine comments=True, style=True, - links=True, - meta=True, + links=False, # e.g. 
availability is included in <link> tags + meta=False, # some sites use <meta> tags in body to provide property page_structure=False, # may be nice to have processing_instructions=True, embedded=False, # keep embedded content From 7422c018f7dbc9885ef00035eaa8f308759f56a2 Mon Sep 17 00:00:00 2001 From: jakubwasikowski <jakub.wasikowski@gmail.com> Date: Wed, 31 Jul 2019 17:41:45 +0200 Subject: [PATCH 3/4] Get rid of _xp_clean_text --- extruct/w3cmicrodata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index bc25ef6f..6bd83b2b 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -49,7 +49,6 @@ class LxmlMicrodataExtractor(object): _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop], .//*[@itemscope]//*[@itemprop])""", namespaces = {"set": "http://exslt.org/sets"}) - _xp_clean_text = lxml.etree.XPath('descendant-or-self::*[not(self::script or self::style)]/text()') # ancestor and preceding axes contain all elements before the context node # so counting them gives the "document order" of the context node _xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope]) From d8c03b7c7fece8c2ae9f0cbe22bad9afa5554d0d Mon Sep 17 00:00:00 2001 From: jakubwasikowski <jakub.wasikowski@gmail.com> Date: Fri, 2 Aug 2019 13:09:50 +0200 Subject: [PATCH 4/4] Change place where cleaning document is applied --- extruct/w3cmicrodata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 6bd83b2b..f15a798e 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -66,14 +66,15 @@ def get_docid(self, node): def extract(self, htmlstring, base_url=None, encoding="UTF-8"): tree = parse_html(htmlstring, encoding=encoding) - return self.extract_items(cleaner.clean_html(tree), base_url) + return self.extract_items(tree, base_url) def extract_items(self, document, base_url): + cleaned_document = cleaner.clean_html(document) items_seen =
set() return [ item for item in ( self._extract_item(it, items_seen=items_seen, base_url=base_url) - for it in self._xp_item(document)) + for it in self._xp_item(cleaned_document)) if item] def _extract_item(self, node, items_seen, base_url):