Skip to content

Commit 57247b8

Browse files
Coworkp1c2u
authored andcommitted
Refactor SchemaValidator caches to be per-resolver
The class-level _needs_state_cache was keyed on SchemaPath, but SchemaPath equality (inherited from pathable.BasePath) is path-only: two distinct OpenAPI specs that share a JSON-pointer path collide, returning stale answers across specs. The bug is silent in production because validation typically runs against a single spec at a time, but bites in any host that loads more than one spec, and in test suites where fresh SchemaPath.from_dict() calls produce short colliding paths. Replace the cache with a per-resolver registry (one cache per loaded spec), keyed on the resolver's identity and evicted via weakref.finalize when the resolver is garbage-collected. Inner cache keys are id(content_dict), which is safe within a single spec (the cache only lives as long as the resolver does, so id() reuse cannot cross spec boundaries). Perf-neutral on the bench (357 vs 348 ops/sec, within noise) because _schema_needs_state is only consulted during state-building, not on the per-value hot path. Adds a regression test that two specs with colliding paths return correct independent answers, and a GC test that the per-resolver cache slot is released when the spec is collected.
1 parent 850789a commit 57247b8

3 files changed

Lines changed: 252 additions & 39 deletions

File tree

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""Per-resolver schema-property caches.
2+
3+
Several ``SchemaValidator`` methods need to answer static questions
4+
about a schema -- "does this subtree carry composition?" or "does this
5+
subtree contain a binary/byte string?" -- and reuse the answers across
6+
many validation calls. A naive class-level cache keyed on ``SchemaPath``
7+
is unsafe because ``SchemaPath`` equality / hashing (inherited from
8+
``pathable.BasePath``) is path-only: two distinct OpenAPI specs that
9+
happen to share a JSON-pointer path (``anyOf#0``) collide.
10+
11+
This module provides a small key abstraction that keeps the answers
12+
correct across specs and lets them be reclaimed when the spec is
13+
garbage-collected.
14+
15+
Design:
16+
17+
* Each OpenAPI spec resolves through a single, stable ``Resolver``
18+
instance. All ``SchemaPath`` objects derived from the same root spec
19+
share that resolver, so the resolver's identity is a reliable
20+
per-spec key (verified empirically against ``jsonschema-path``).
21+
* Each spec's content is laid out as a single tree of dict objects.
22+
Two distinct dicts within the same spec have distinct ``id()``
23+
values, and the ``id()`` is stable for the lifetime of the dict
24+
(it is a CPython memory address). Within a spec, ``id(content)``
25+
is therefore safe as an inner cache key.
26+
* When the spec (and its resolver) is collected, ``weakref.finalize``
27+
evicts the entire spec's cache slot in one shot. This both prevents
28+
the cache from pinning the spec in memory and forecloses on the
29+
classic ``id()``-reuse hazard.
30+
31+
The module exposes one helper per query: ``ResolverScopedCache.get`` /
32+
``put``. Callers are responsible for the actual computation -- the
33+
cache only stores results.
34+
"""
35+
36+
from __future__ import annotations
37+
38+
import weakref
39+
from typing import Any
40+
from typing import Dict
41+
from typing import Optional
42+
43+
44+
class _PerResolverCache:
45+
"""One spec's worth of cached answers.
46+
47+
``slots`` reduces the per-spec overhead to two dict slots; we
48+
expect at most a handful of these to exist concurrently (one per
49+
loaded OpenAPI document).
50+
"""
51+
52+
__slots__ = ("needs_state", "needs_binary_normalization")
53+
54+
def __init__(self) -> None:
55+
self.needs_state: Dict[int, bool] = {}
56+
self.needs_binary_normalization: Dict[int, bool] = {}
57+
58+
59+
# Class-level registry of per-resolver caches. Keys are ``id(resolver)``
60+
# and entries are removed via ``weakref.finalize`` when the resolver is
61+
# collected; ``id()`` reuse is therefore safe by construction (the slot
62+
# is empty before the next resolver can claim the address).
63+
_caches: Dict[int, _PerResolverCache] = {}
64+
65+
66+
def cache_for(resolver: Any) -> _PerResolverCache:
67+
"""Return the per-resolver cache for ``resolver``, creating it on
68+
first access. Registers a finalizer so the entry evicts when the
69+
resolver is collected.
70+
"""
71+
rid = id(resolver)
72+
cache = _caches.get(rid)
73+
if cache is not None:
74+
return cache
75+
cache = _PerResolverCache()
76+
_caches[rid] = cache
77+
# ``weakref.finalize`` is the only mechanism that survives the
78+
# resolver's collection. The callback pops by the resolver's *old*
79+
# id, which is correct: the slot was claimed by this resolver and
80+
# nothing else can occupy it until this callback fires.
81+
weakref.finalize(resolver, _caches.pop, rid, None)
82+
return cache
83+
84+
85+
def _reset_for_tests() -> None:
86+
"""Drop all cached entries. Test-only helper; production code never
87+
needs to call this because the resolver lifetime drives eviction.
88+
"""
89+
_caches.clear()
90+
91+
92+
__all__ = ["cache_for", "_PerResolverCache", "_reset_for_tests"]

openapi_core/validation/schemas/validators.py

Lines changed: 58 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from openapi_core.validation.schemas.datatypes import (
1818
_EMPTY_STATES as _EMPTY_STATES_MAP,
1919
)
20+
from openapi_core.validation.schemas._caches import cache_for as _cache_for
2021
from openapi_core.validation.schemas.datatypes import FormatValidator
2122
from openapi_core.validation.schemas.datatypes import ValidationState
2223
from openapi_core.validation.schemas.exceptions import InvalidSchemaValue
@@ -47,60 +48,78 @@ def validate(self, value: Any) -> None:
4748
schema_type = (self.schema / "type").read_str_or_list("any")
4849
raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
4950

50-
# Cache the recursive "does this schema benefit from a ValidationState?"
51-
# check, keyed on the SchemaPath. SchemaPath is hashed by content, so
52-
# two SchemaPaths pointing at the same spec location share a cache
53-
# slot regardless of identity -- safe across GC, bounded by the number
54-
# of distinct schema shapes in the spec rather than by input volume.
55-
_needs_state_cache: dict[SchemaPath, bool] = {}
56-
5751
@classmethod
5852
def _schema_needs_state(cls, schema: SchemaPath) -> bool:
5953
"""True if building a ValidationState for ``schema`` carries
6054
information the unmarshaller can reuse: either composition
6155
(oneOf/anyOf/allOf) on this node, or a descendant that does.
6256
63-
Cycle-safe: a False sentinel is stored before recursing, so a
64-
$ref loop terminates and the real answer overwrites the
65-
sentinel once the recursion completes.
57+
The answer is purely a function of the resolved schema contents,
58+
so we cache it per-resolver (i.e. per OpenAPI spec) keyed on
59+
the content dict's identity. See ``_caches.py`` for why a
60+
SchemaPath-keyed cache would be unsafe across specs.
6661
"""
67-
cache = cls._needs_state_cache
68-
cached = cache.get(schema)
62+
with schema.resolve() as resolved:
63+
return cls._contents_need_state(
64+
resolved.contents, _cache_for(resolved.resolver), set()
65+
)
66+
67+
@classmethod
68+
def _contents_need_state(
69+
cls,
70+
contents: Any,
71+
cache: Any,
72+
seen: set,
73+
) -> bool:
74+
# Boolean schemas (True/False) and other non-dict shapes can't
75+
# introduce composition.
76+
if not isinstance(contents, dict):
77+
return False
78+
79+
marker = id(contents)
80+
cached = cache.needs_state.get(marker)
6981
if cached is not None:
7082
return cached
71-
# Self-composition is the strongest signal; check it first to
72-
# short-circuit the cheap case.
73-
if "oneOf" in schema or "anyOf" in schema or "allOf" in schema:
74-
cache[schema] = True
83+
# Cycle protection: a $ref loop resolves back to the same dict.
84+
# ``seen`` is per-call (not shared across calls), so a True
85+
# result downstream still propagates back up correctly.
86+
if marker in seen:
87+
return False
88+
seen.add(marker)
89+
90+
# Self-composition: strongest signal, short-circuit.
91+
if (
92+
"oneOf" in contents
93+
or "anyOf" in contents
94+
or "allOf" in contents
95+
):
96+
cache.needs_state[marker] = True
7597
return True
76-
# Seed the in-progress sentinel for cycle protection.
77-
cache[schema] = False
78-
# Recurse into children. We only need to find one descendant
79-
# that needs state to flip our own answer.
98+
8099
result = False
81-
if "properties" in schema:
82-
prop_iter = (schema / "properties").items()
83-
for prop_name, prop_schema in prop_iter:
84-
if not isinstance(prop_name, str):
85-
continue
86-
if cls._schema_needs_state(prop_schema):
100+
101+
properties = contents.get("properties")
102+
if isinstance(properties, dict):
103+
for prop_schema in properties.values():
104+
if cls._contents_need_state(prop_schema, cache, seen):
87105
result = True
88106
break
89-
if not result and "additionalProperties" in schema:
90-
try:
91-
ap = schema / "additionalProperties"
92-
except Exception:
93-
ap = None
94-
if ap is not None and cls._schema_needs_state(ap):
107+
108+
if not result:
109+
additional = contents.get("additionalProperties")
110+
if isinstance(additional, dict) and cls._contents_need_state(
111+
additional, cache, seen
112+
):
95113
result = True
96-
if not result and "items" in schema:
97-
try:
98-
items = schema / "items"
99-
except Exception:
100-
items = None
101-
if items is not None and cls._schema_needs_state(items):
114+
115+
if not result:
116+
items = contents.get("items")
117+
if isinstance(items, dict) and cls._contents_need_state(
118+
items, cache, seen
119+
):
102120
result = True
103-
cache[schema] = result
121+
122+
cache.needs_state[marker] = result
104123
return result
105124

106125
def validate_state(self, value: Any) -> ValidationState:

tests/unit/validation/test_schema_validators.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,3 +356,105 @@ def test_enforce_properties_required_applies_to_nested_composed_schemas(
356356
schema,
357357
enforce_properties_required=True,
358358
).validate({"name": "openapi-core", "meta": {}})
359+
360+
361+
362+
class TestSchemaValidatorCacheIsolation:
363+
"""The per-resolver cache must keep ``_schema_needs_state`` answers
364+
independent across distinct OpenAPI specs that happen to share
365+
JSON-pointer paths.
366+
367+
Regression test for the ``SchemaPath``-keyed cache: ``SchemaPath``
368+
equality is path-only (inherited from ``pathable.BasePath``), so a
369+
``dict``-keyed cache would collide on identical paths regardless of
370+
what the paths actually resolve to. The bug is silent in production
371+
because all evolved schemas come from one spec, but bites in any
372+
process that loads more than one.
373+
"""
374+
375+
def test_disjoint_specs_with_colliding_paths(self):
376+
# Both specs have a value at ``anyOf/0`` but one is a leaf
377+
# string and the other carries oneOf -- only the second should
378+
# report needs_state=True.
379+
from openapi_core.validation.schemas.validators import SchemaValidator
380+
381+
spec_simple = SchemaPath.from_dict(
382+
{"anyOf": [{"type": "string"}, {"type": "integer"}]}
383+
)
384+
spec_composed = SchemaPath.from_dict(
385+
{
386+
"anyOf": [
387+
{
388+
"type": "object",
389+
"properties": {
390+
"x": {
391+
"oneOf": [
392+
{"type": "string"},
393+
{"type": "integer"},
394+
]
395+
}
396+
},
397+
},
398+
{"type": "integer"},
399+
]
400+
}
401+
)
402+
403+
# Each branch's value at anyOf/0 has the SAME SchemaPath
404+
# (anyOf#0) but disjoint contents.
405+
simple_branch = spec_simple / "anyOf" / 0
406+
composed_branch = spec_composed / "anyOf" / 0
407+
assert simple_branch == composed_branch # path-only equality
408+
assert hash(simple_branch) == hash(composed_branch)
409+
410+
# The cache must distinguish them by spec.
411+
assert SchemaValidator._schema_needs_state(simple_branch) is False
412+
assert SchemaValidator._schema_needs_state(composed_branch) is True
413+
# And the order doesn't matter -- ask in reverse.
414+
spec_simple_2 = SchemaPath.from_dict(
415+
{"anyOf": [{"type": "string"}, {"type": "integer"}]}
416+
)
417+
spec_composed_2 = SchemaPath.from_dict(
418+
{
419+
"anyOf": [
420+
{"oneOf": [{"type": "string"}]},
421+
{"type": "integer"},
422+
]
423+
}
424+
)
425+
assert (
426+
SchemaValidator._schema_needs_state(
427+
spec_composed_2 / "anyOf" / 0
428+
)
429+
is True
430+
)
431+
assert (
432+
SchemaValidator._schema_needs_state(
433+
spec_simple_2 / "anyOf" / 0
434+
)
435+
is False
436+
)
437+
438+
def test_cache_evicts_on_resolver_collection(self):
439+
# When a spec's resolver is garbage-collected, its cache slot
440+
# is dropped. This both prevents the cache from pinning the
441+
# spec in memory and forecloses on the classic id()-reuse
442+
# hazard (a freshly allocated resolver cannot inherit stale
443+
# answers from a collected one at the same address).
444+
import gc
445+
446+
from openapi_core.validation.schemas._caches import _caches
447+
from openapi_core.validation.schemas.validators import SchemaValidator
448+
449+
before = len(_caches)
450+
spec = SchemaPath.from_dict(
451+
{"oneOf": [{"type": "string"}, {"type": "integer"}]}
452+
)
453+
SchemaValidator._schema_needs_state(spec)
454+
# Capturing one extra slot is what we expect.
455+
assert len(_caches) == before + 1
456+
457+
# Drop the only outside reference; the cache slot must follow.
458+
del spec
459+
gc.collect()
460+
assert len(_caches) == before

0 commit comments

Comments
 (0)