diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py
index cb8c9fa7..f15a798e 100644
--- a/extruct/w3cmicrodata.py
+++ b/extruct/w3cmicrodata.py
@@ -31,8 +31,8 @@
javascript=False, # onclick attributes are fine
comments=True,
style=True,
- links=True,
- meta=True,
+ links=False, # e.g. availability is included in <link> tags
+ meta=False, # some sites use <meta> tags in body to provide property
page_structure=False, # <html>, <head>, <title> may be nice to have
processing_instructions=True,
embedded=False, # keep embedded content
@@ -49,7 +49,6 @@ class LxmlMicrodataExtractor(object):
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
.//*[@itemscope]//*[@itemprop])""",
namespaces = {"set": "http://exslt.org/sets"})
- _xp_clean_text = lxml.etree.XPath('descendant-or-self::*[not(self::script or self::style)]/text()')
# ancestor and preceding axes contain all elements before the context node
# so counting them gives the "document order" of the context node
_xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope])
@@ -70,11 +69,12 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
return self.extract_items(tree, base_url)
def extract_items(self, document, base_url):
+ cleaned_document = cleaner.clean_html(document)
items_seen = set()
return [
item for item in (
self._extract_item(it, items_seen=items_seen, base_url=base_url)
- for it in self._xp_item(document))
+ for it in self._xp_item(cleaned_document))
if item]
def _extract_item(self, node, items_seen, base_url):
@@ -203,8 +203,7 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
return self._extract_textContent(node)
def _extract_textContent(self, node):
- clean_node = cleaner.clean_html(node)
- return html_text.etree_to_text(clean_node)
+ return html_text.etree_to_text(node)
MicrodataExtractor = LxmlMicrodataExtractor