Skip to content

Make the html cleaning for microdata faster #123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions extruct/w3cmicrodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
javascript=False, # onclick attributes are fine
comments=True,
style=True,
links=True,
meta=True,
links=False, # e.g. availability is included in <link> tags
meta=False, # some sites use <meta> tags in body to provide property
page_structure=False, # <title> may be nice to have
processing_instructions=True,
embedded=False, # keep embedded content
Expand All @@ -49,7 +49,6 @@ class LxmlMicrodataExtractor(object):
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
.//*[@itemscope]//*[@itemprop])""",
namespaces = {"set": "http://exslt.org/sets"})
_xp_clean_text = lxml.etree.XPath('descendant-or-self::*[not(self::script or self::style)]/text()')
# ancestor and preceding axes contain all elements before the context node
# so counting them gives the "document order" of the context node
_xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope])
Expand All @@ -70,11 +69,12 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
return self.extract_items(tree, base_url)

# NOTE(review): this span is a rendered PR diff WITHOUT +/- markers — it shows
# both the removed line (xp_item over `document`) and the added line
# (xp_item over `cleaned_document`) back to back; it is not runnable Python
# as-is. Comments below label which side of the diff each line belongs to.
def extract_items(self, document, base_url):
# Added by the PR: clean the whole tree ONCE up front, instead of cleaning
# every property node individually in _extract_textContent — this is the
# speed-up the PR title ("Make the html cleaning for microdata faster")
# refers to.
cleaned_document = cleaner.clean_html(document)
items_seen = set()
return [
item for item in (
self._extract_item(it, items_seen=items_seen, base_url=base_url)
# removed line (pre-PR): iterated itemscope nodes of the RAW document
for it in self._xp_item(document))
# added line (post-PR): iterate the pre-cleaned document instead
for it in self._xp_item(cleaned_document))
if item]

def _extract_item(self, node, items_seen, base_url):
Expand Down Expand Up @@ -203,8 +203,7 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
return self._extract_textContent(node)

# NOTE(review): rendered diff without +/- markers — the two removed lines
# (per-node clean_html) and their single-line replacement both appear below.
def _extract_textContent(self, node):
# removed lines (pre-PR): cleaned each node right before text extraction,
# which re-ran the (expensive) cleaner once per itemprop node.
clean_node = cleaner.clean_html(node)
return html_text.etree_to_text(clean_node)
# added line (post-PR): the document is already cleaned once in
# extract_items, so the text can be extracted directly from the node.
return html_text.etree_to_text(node)


MicrodataExtractor = LxmlMicrodataExtractor