From 901fd3b3bda1120aa60efe9b23195536e3dd8bc9 Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Thu, 2 Jan 2025 23:44:00 +0800 Subject: [PATCH] Add custom event name interning to yajl2_c Since its inception, the yajl2_c backend's performance has relied on a few tricks, one of the main ones being the fact that it uses its own global (now part of the module state) event name PyUnicode instances, since there are only so many event names that are always the same. Because internally the backend always uses these objects, we can perform direct pointer comparison when checking for event name equality. This works well for internal event names, but it actually breaks when users provide their own string instances. This can occur when doing event interception, or when using the *_coro function family. In these cases, it cannot be guaranteed that input event names are the same objects are the global ones kept by yajl2_c, and therefore pointer comparison will fail, leading to confusing behaviour. This commit remedies this situation by performing our own poor man's interning logic. When calling into the send() methods of the yajl2_c objects, we try to find the internal event name object corresponding to the one given by the user based on its hash value (but checking for pointer equality first, we might still be receiving a yajl2_c global object)x, and use that internally. If no corresponding event name can be found then it is used as-is, which will lead to undefined behaviour (but that's the case already too, even with the other backends). Since the yajl2_c backend itself doesn't invoke the send() functions directly but the underlying send_impl() ones (to save an unnecessary tuple packing/unpacking), performance isn't affected for normal use cases. New tests have been added showing these scenarios where user-provided event name instances are fed into the different methods. These tests fail for yajl2_c if the changes in this commit are not applied. Signed-off-by: Rodrigo Tobar --- CHANGELOG.md | 2 ++ src/ijson/backends/ext/_yajl2/event_names.h | 19 +++++++++++++++++++ .../backends/ext/_yajl2/items_basecoro.c | 1 + .../backends/ext/_yajl2/kvitems_basecoro.c | 1 + .../backends/ext/_yajl2/parse_basecoro.c | 1 + tests/test_items.py | 14 ++++++++++++++ tests/test_kvitems.py | 14 ++++++++++++++ tests/test_parse.py | 18 ++++++++++++++++++ 8 files changed, 70 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b47e36..067c85d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ * Improved performance of `parse` routine in C backend by ~4%. * Fixed several potential stability issues in C backend around correct error handling. +* Fixed corner-case wrong behaviour of `yajl2_c` backend, + which didn't work correctly with user-provided event names. * Pointing to our own fork of yajl (for when we build it ourselves) that contains fixes for all known CVEs (#126). * Removed leftover compatibility bits in the C backend. diff --git a/src/ijson/backends/ext/_yajl2/event_names.h b/src/ijson/backends/ext/_yajl2/event_names.h index a3040e3..c55d37e 100644 --- a/src/ijson/backends/ext/_yajl2/event_names.h +++ b/src/ijson/backends/ext/_yajl2/event_names.h @@ -50,4 +50,23 @@ typedef struct _event_names { f(9, start_array_ename, "start_array"); \ f(10, end_array_ename, "end_array"); +/* Returns the module-internal event name unicode object for the given */ +static inline PyObject *get_builtin_ename(enames_t *enames, PyObject *event) +{ +#define SWAP(x, y) { Py_INCREF(y); Py_DECREF(x); return y; } + + /* Compare by pointer equality first, then by hash */ +#define MATCH(i, member, _value) if (enames->member == event) SWAP(event, enames->member) + FOR_EACH_EVENT(MATCH) +#undef MATCH + + Py_hash_t hash = PyObject_Hash(event); + Py_hash_t *hashes = enames->hashes; +#define MATCH(i, member, _value) if (hashes[i] == hash) SWAP(event, enames->member) + FOR_EACH_EVENT(MATCH) +#undef MATCH + + return event; +} + #endif /* EVENT_NAMES_H */ \ No newline at end of file diff --git a/src/ijson/backends/ext/_yajl2/items_basecoro.c b/src/ijson/backends/ext/_yajl2/items_basecoro.c index a557d96..f7818cb 100644 --- a/src/ijson/backends/ext/_yajl2/items_basecoro.c +++ b/src/ijson/backends/ext/_yajl2/items_basecoro.c @@ -81,6 +81,7 @@ static PyObject* items_basecoro_send(PyObject *self, PyObject *tuple) PyObject *value = NULL; PyObject *result = NULL; if(!ijson_unpack(tuple, 3, &path, &event, &value)) { + event = get_builtin_ename(&((ItemsBasecoro *)self)->module_state->enames, event); result = items_basecoro_send_impl(self, path, event, value); } Py_XDECREF(value); diff --git a/src/ijson/backends/ext/_yajl2/kvitems_basecoro.c b/src/ijson/backends/ext/_yajl2/kvitems_basecoro.c index 3d551c1..77af3de 100644 --- a/src/ijson/backends/ext/_yajl2/kvitems_basecoro.c +++ b/src/ijson/backends/ext/_yajl2/kvitems_basecoro.c @@ -105,6 +105,7 @@ static PyObject* kvitems_basecoro_send(PyObject *self, PyObject *tuple) PyObject *value = NULL; PyObject *result = NULL; if(!ijson_unpack(tuple, 3, &path, &event, &value)) { + event = get_builtin_ename(&((KVItemsBasecoro *)self)->module_state->enames, event); result = kvitems_basecoro_send_impl(self, path, event, value); } Py_XDECREF(value); diff --git a/src/ijson/backends/ext/_yajl2/parse_basecoro.c b/src/ijson/backends/ext/_yajl2/parse_basecoro.c index 351a655..3705a84 100644 --- a/src/ijson/backends/ext/_yajl2/parse_basecoro.c +++ b/src/ijson/backends/ext/_yajl2/parse_basecoro.c @@ -121,6 +121,7 @@ static PyObject* parse_basecoro_send(PyObject *self, PyObject *tuple) PyObject *value = NULL; PyObject *result = NULL; if(!ijson_unpack(tuple, 2, &event, &value)) { + event = get_builtin_ename(&((ParseBasecoro *)self)->module_state->enames, event); result = parse_basecoro_send_impl(self, event, value); } Py_XDECREF(value); diff --git a/tests/test_items.py b/tests/test_items.py index b4de142..ee35b03 100644 --- a/tests/test_items.py +++ b/tests/test_items.py @@ -82,3 +82,17 @@ def test_coro_needs_input_with_three_elements(backend): # not an iterable with pytest.raises(TypeError, match="cannot unpack"): next(backend.items([None], '')) + + +def test_user_events(backend): + """ + User-provided events work correct -- yajl2_c used to fail with event names + that weren't generated by itself. + """ + embedded_empty_list_events = [ + ('', 'start_array', None), + ('item', 'start_array', None), + ('item', 'end_array', None), + ('', 'end_array', None), + ] + assert [[]] == list(backend.items(embedded_empty_list_events, 'item')) diff --git a/tests/test_kvitems.py b/tests/test_kvitems.py index 4e89e70..24484be 100644 --- a/tests/test_kvitems.py +++ b/tests/test_kvitems.py @@ -58,3 +58,17 @@ def test_coro_needs_input_with_three_elements(backend): # not an iterable with pytest.raises(TypeError, match="cannot unpack"): next(backend.kvitems([None], '')) + + +def test_user_events(backend): + """ + User-provided events work correct -- yajl2_c used to fail with event names + that weren't generated by itself. + """ + int_element_parse_events = [ + ('', 'start_map', None), + ('', 'map_key', 'a'), + ('a', 'number', 0), + ('', 'end_map', None), + ] + assert [('a', 0)] == list(backend.kvitems(int_element_parse_events, '')) diff --git a/tests/test_parse.py b/tests/test_parse.py index 61cae3b..0f8cc9f 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -23,3 +23,21 @@ def test_coro_needs_input_with_two_elements(backend): # not an iterable with pytest.raises(TypeError, match="cannot unpack"): next(backend.parse([None])) + + +def test_user_events(backend): + """ + User-provided events work correct -- yajl2_c used to fail with event names + that weren't generated by itself. + """ + int_array_basic_parse_events = [ + ('start_array', None), + ('number', 0), + ('end_array', None), + ] + expected_parse_events = [ + ('', 'start_array', None), + ('item', 'number', 0), + ('', 'end_array', None), + ] + assert expected_parse_events == list(backend.parse(int_array_basic_parse_events))