From 59e611eb662d5065e1375041d3e84a2b61142a46 Mon Sep 17 00:00:00 2001 From: Cathal Garvey Date: Fri, 30 Mar 2018 17:04:33 +0100 Subject: [PATCH 1/4] Added a JSON hot-swapping utility in utils.py, plus tests. --- extruct/utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 extruct/utils.py create mode 100644 tests/test_utils.py diff --git a/extruct/utils.py b/extruct/utils.py new file mode 100644 index 00000000..eee17803 --- /dev/null +++ b/extruct/utils.py @@ -0,0 +1,53 @@ +import json +import sys + +_json_decoder = json.loads +_json_decoder_raises = tuple() # Better not to catch built-in errors at all! +def set_json_decoder(loader_func=_json_decoder, + loader_raises=_json_decoder_raises): + """ + Sets extruct's preferred JSON decoder function. + + You should provide a function that accepts strings, and returns decoded + native objects, as loader_func. + + When your preferred decoder encounters non-JSON strings, or malformed JSON, + typically it will raise Exceptions. Extruct expects json.JSONDecodeError + in such cases. If your preferred decoder does something else (such as the + ValueErrors raised by ujson), provide a tuple of all Exception classes + raised on bad JSON or non-JSON. + """ + global _json_decoder + global _json_decoder_raises + _json_decoder = loader_func + _json_decoder_raises = loader_raises + + +def json_loads(json_string): + """ + Uses the preferred JSON decoder (default is stdlib json) to decode a string, + converting any idiosyncratic exceptions to json.JSONDecodeError. + + Using this utility function allows one to swap in different decoders with + utils.set_json_decoder, for example to use `ujson`, without requiring + extruct to directly handle and support the weirdnesses of each third-party + json library. + """ + # Does this need `global _json_decoder` to prevent reference capture and failure-to-switch? + try: + data = _json_decoder(json_string) + except _json_decoder_raises as E: + # TODO: Deprecate with Python 2. Reason: Prefer exception chaining with `raise from` + _, _, traceback = sys.exc_info() + if sys.version_info < (3,): + raise ValueError("Error decoding document: {}".format(traceback)) + else: + if isinstance(E, json.JSONDecodeError): + raise + else: + raise json.JSONDecodeError( + msg="Error decoding document (error index unknown, see preceding traceback)", + doc=json_string, + pos=0, + ).with_traceback(traceback) + return data diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..f8c52951 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +import unittest +import json + +from extruct import utils + + +# Python 2 is a vengeful fossil +native_json_exc = getattr(json, 'JSONDecodeError', ValueError) + + +class NotAJSONDecodeError(ValueError): + pass + + +class _json_shimmer: + def __init__(self): + self.invoked = False + + def decode(self, json_string): + if json_string == 'FAIL WITH VALUEERROR': + raise ValueError("Yes sir") + try: + return json.loads(json_string) + except native_json_exc: + raise NotAJSONDecodeError("This operation totally failed") + finally: + self.invoked = True + + +class TestJson(unittest.TestCase): + + def test_json_abstraction(self): + # Use default decoder + self.assertEqual(utils._json_decoder, json.loads) + self.assertEqual(utils._json_decoder_raises, tuple()) + self.assertEqual(utils.json_loads('{}'), {}) + with self.assertRaises(native_json_exc): # ugh, Python 2 + utils.json_loads('{') + # Set decoder, try again + shimmer = _json_shimmer() + utils.set_json_decoder(shimmer.decode, (NotAJSONDecodeError,)) + self.assertEqual(utils._json_decoder, shimmer.decode) + self.assertEqual(utils._json_decoder_raises, (NotAJSONDecodeError,)) + self.assertEqual(utils.json_loads('{}'), {}) + # ensure utils.json_loads didn't call a stale reference to json.loads + self.assertTrue(shimmer.invoked) + # Specified exceptions should be converted to JSONDecodeErrors. + with self.assertRaises(native_json_exc): + utils.json_loads('{') + # Others should not. + with self.assertRaises(ValueError): + utils.json_loads('FAIL WITH VALUEERROR') From 456e209be9c70c766f4c8e3b36488a9ed37068f1 Mon Sep 17 00:00:00 2001 From: Cathal Garvey Date: Fri, 30 Mar 2018 17:07:25 +0100 Subject: [PATCH 2/4] Slight change to utils.json_loads to enable passthrough for ValueError in Python 2 --- extruct/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extruct/utils.py b/extruct/utils.py index eee17803..444352c6 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -40,7 +40,10 @@ def json_loads(json_string): # TODO: Deprecate with Python 2. Reason: Prefer exception chaining with `raise from` _, _, traceback = sys.exc_info() if sys.version_info < (3,): - raise ValueError("Error decoding document: {}".format(traceback)) + if isinstance(E, ValueError): + raise + else: + raise ValueError("Error decoding document: {}".format(traceback)) else: if isinstance(E, json.JSONDecodeError): raise From 34ac135469dac81a76db5c896f186460180823c5 Mon Sep 17 00:00:00 2001 From: Cathal Garvey Date: Fri, 30 Mar 2018 17:23:55 +0100 Subject: [PATCH 3/4] Enables JSON hot-swapping for jsonld and rdfa modules --- extruct/jsonld.py | 8 ++++---- extruct/rdfa.py | 4 ++-- extruct/utils.py | 25 +++++++++++++------------ tests/test_utils.py | 10 +++------- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index b0cecefb..a66ef99c 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -3,7 +3,7 @@ JSON-LD extractor """ -import json +from extruct import utils import re import lxml.etree @@ -29,10 +29,10 @@ def extract_items(self, document, *args, **kwargs): def _extract_items(self, node): script = node.xpath('string()') try: - data = json.loads(script) - except ValueError: + data = utils.json_loads(script) + except utils.native_json_exc: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script)) + data = utils.json_loads(HTML_OR_JS_COMMENTLINE.sub('', script)) if isinstance(data, list): return data elif isinstance(data, dict): diff --git a/extruct/rdfa.py b/extruct/rdfa.py index 8d93feb2..d2a05e42 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -4,7 +4,7 @@ Based on pyrdfa3 and rdflib """ -import json +from extruct import utils import logging rdflib_logger = logging.getLogger('rdflib') rdflib_logger.setLevel(logging.ERROR) @@ -48,4 +48,4 @@ def extract_items(self, document, url, expanded=True, *args, **kwargs): g = PyRdfa(options, base=url).graph_from_DOM(document, graph=Graph(), pgraph=Graph()) jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8') - return json.loads(jsonld_string) + return utils.json_loads(jsonld_string) diff --git a/extruct/utils.py b/extruct/utils.py index 444352c6..48b385f8 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -1,6 +1,11 @@ import json import sys + +# Python 2 is a vengeful fossil +native_json_exc = getattr(json, 'JSONDecodeError', ValueError) + + _json_decoder = json.loads _json_decoder_raises = tuple() # Better not to catch built-in errors at all! def set_json_decoder(loader_func=_json_decoder, @@ -38,19 +43,15 @@ def json_loads(json_string): data = _json_decoder(json_string) except _json_decoder_raises as E: # TODO: Deprecate with Python 2. Reason: Prefer exception chaining with `raise from` + if isinstance(E, native_json_exc): + raise _, _, traceback = sys.exc_info() if sys.version_info < (3,): - if isinstance(E, ValueError): - raise - else: - raise ValueError("Error decoding document: {}".format(traceback)) + raise ValueError("Error decoding document: {}".format(traceback)) else: - if isinstance(E, json.JSONDecodeError): - raise - else: - raise json.JSONDecodeError( - msg="Error decoding document (error index unknown, see preceding traceback)", - doc=json_string, - pos=0, - ).with_traceback(traceback) + raise json.JSONDecodeError( + msg="Error decoding document (error index unknown, see preceding traceback)", + doc=json_string, + pos=0, + ).with_traceback(traceback) return data diff --git a/tests/test_utils.py b/tests/test_utils.py index f8c52951..41cb8668 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,10 +5,6 @@ from extruct import utils -# Python 2 is a vengeful fossil -native_json_exc = getattr(json, 'JSONDecodeError', ValueError) - - class NotAJSONDecodeError(ValueError): pass @@ -22,7 +18,7 @@ def decode(self, json_string): raise ValueError("Yes sir") try: return json.loads(json_string) - except native_json_exc: + except utils.native_json_exc: raise NotAJSONDecodeError("This operation totally failed") finally: self.invoked = True @@ -35,7 +31,7 @@ def test_json_abstraction(self): self.assertEqual(utils._json_decoder, json.loads) self.assertEqual(utils._json_decoder_raises, tuple()) self.assertEqual(utils.json_loads('{}'), {}) - with self.assertRaises(native_json_exc): # ugh, Python 2 + with self.assertRaises(utils.native_json_exc): # ugh, Python 2 utils.json_loads('{') # Set decoder, try again shimmer = _json_shimmer() @@ -46,7 +42,7 @@ def test_json_abstraction(self): # ensure utils.json_loads didn't call a stale reference to json.loads self.assertTrue(shimmer.invoked) # Specified exceptions should be converted to JSONDecodeErrors. - with self.assertRaises(native_json_exc): + with self.assertRaises(utils.native_json_exc): utils.json_loads('{') # Others should not. with self.assertRaises(ValueError): From e469ec80de03f844d307788256cced1a459baf47 Mon Sep 17 00:00:00 2001 From: Cathal Garvey Date: Fri, 30 Mar 2018 17:29:57 +0100 Subject: [PATCH 4/4] Added a test case exploring whether ujson seems to work (very poorly tested!) Relevant to #37 (hey @codinguncut ;) ) Closes: #49 --- setup.py | 1 + tests/test_utils.py | 7 +++++++ tox.ini | 1 + 3 files changed, 9 insertions(+) diff --git a/setup.py b/setup.py index 38ded3b0..d0431fe8 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ def get_version(): packages=find_packages(exclude=['tests',]), package_data={'extruct': ['VERSION']}, install_requires=['lxml', 'rdflib', 'rdflib-jsonld'], + tests_require=['ujson'], extras_require={ 'service': [ 'bottle', diff --git a/tests/test_utils.py b/tests/test_utils.py index 41cb8668..e7c087f8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,7 @@ import unittest import json +import ujson from extruct import utils @@ -47,3 +48,9 @@ def test_json_abstraction(self): # Others should not. with self.assertRaises(ValueError): utils.json_loads('FAIL WITH VALUEERROR') + + def test_ujson(self): + utils.set_json_decoder(ujson.loads, (ValueError,)) + self.assertEqual(utils._json_decoder, ujson.loads) + self.assertEqual(utils._json_decoder_raises, (ValueError,)) + self.assertEqual(utils.json_loads('{"foo": "bar"}'), {'foo': 'bar'}) diff --git a/tox.ini b/tox.ini index e2bc2898..36941785 100644 --- a/tox.ini +++ b/tox.ini @@ -7,5 +7,6 @@ deps = pytest pytest-cov mock + ujson commands = py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests}