Skip to content

Commit

Permalink
ignore processing instructions and doctypes
Browse files: browse the repository at this point in the history
  • Loading branch information
keith-hall committed Dec 28, 2015
1 parent 8d490ea commit dfc17c3
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions lxml_parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,7 @@
from lxml import etree
from xml.sax import make_parser
from lxml.html import fromstring as fromhtmlstring
from xml.sax.handler import feature_external_pes, feature_external_ges

ns_loc = 'lxml'

Expand All @@ -11,6 +12,8 @@ def clean_html(html_soup):

def lxml_etree_parse_xml_string_with_location(xml_string, line_number_offset):
parser = make_parser()
parser.setFeature(feature_external_pes, False)
parser.setFeature(feature_external_ges, False)
global ns_loc

class ETreeContent(ElementTreeContentHandler):
Expand Down Expand Up @@ -135,6 +138,9 @@ def characters(self, data):
self._recordEndPosition()
super().characters(data)

# Handler callback for processing instructions, deliberately left empty:
# `target` and `data` are discarded, so nothing is recorded for them.
# NOTE(review): presumably overrides the xml.sax ContentHandler hook on the
# content-handler class; the enclosing class is not visible in this hunk.
def processingInstruction(self, target, data):
pass # ignore processing instructions

def endDocument(self):
self._recordPosition(self.etree.getroot(), 'close_tag_end_pos')

Expand Down

0 comments on commit dfc17c3

Please sign in to comment.