2222from openapi_core .validation .schemas .exceptions import InvalidSchemaValue
2323from openapi_core .validation .schemas .exceptions import ValidateError
2424
25+ # OpenAPI ``format`` values whose *type: string* schemas are permitted to
26+ # carry a raw ``bytes`` payload end-to-end -- ``binary`` for opaque file
27+ # bodies (multipart/form-data, application/octet-stream) and ``byte`` for
28+ # base64 strings that callers may still hand in as ``bytes``.
29+ _BINARY_STRING_FORMATS = frozenset ({"binary" , "byte" })
30+
2531if TYPE_CHECKING :
2632 from openapi_core .casting .schemas .casters import SchemaCaster
2733
@@ -41,12 +47,157 @@ def __contains__(self, schema_format: str) -> bool:
4147 return schema_format in self .validator .format_checker .checkers
4248
def validate(self, value: Any) -> None:
    """Validate ``value`` against this validator's schema.

    jsonschema only accepts text for ``type: string``, while OpenAPI
    lets ``bytes`` flow through ``string`` schemas whose ``format`` is
    ``binary`` or ``byte``. The validator is therefore run against the
    view produced by ``_normalize_for_validation``; the untouched
    ``value`` is what gets attached to the raised error.

    Raises:
        InvalidSchemaValue: if jsonschema yields at least one error.
    """
    text_view = self._normalize_for_validation(value)
    errors = tuple(self.validator.iter_errors(text_view))
    if not errors:
        return
    schema_type = (self.schema / "type").read_str_or_list("any")
    # Report the caller's original value, not the decoded view.
    raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
4962
63+ @staticmethod
64+ def _decode_binary_value (value : bytes ) -> str :
65+ """Decode raw ``bytes`` into the text view jsonschema expects.
66+
67+ ``utf-8`` first because that's what the vast majority of byte
68+ bodies actually are; falling back to ASCII + ``surrogateescape``
69+ guarantees the call never raises for arbitrary binary payloads
70+ (a real file upload may contain any byte sequence).
71+ """
72+ try :
73+ return value .decode ("utf-8" )
74+ except UnicodeDecodeError :
75+ return value .decode ("ASCII" , errors = "surrogateescape" )
76+
77+ def _accepts_binary_string (self , value : Any ) -> bool :
78+ """True when ``value`` is ``bytes`` and the schema at this
79+ position is a ``string`` whose ``format`` allows raw bytes.
80+ """
81+ if not isinstance (value , bytes ):
82+ return False
83+ schema_format = (self .schema / "format" ).read_str (None )
84+ if schema_format not in _BINARY_STRING_FORMATS :
85+ return False
86+ schema_types = (self .schema / "type" ).read_str_or_list (None )
87+ if schema_types is None :
88+ # No declared type: OAS 3.1 lets any value flow; treat the
89+ # binary/byte format as authoritative.
90+ return True
91+ if isinstance (schema_types , str ):
92+ return schema_types == "string"
93+ return "string" in schema_types
94+
95+ def _normalize_for_validation (self , value : Any ) -> Any :
96+ """Return a view of ``value`` with ``bytes`` decoded to text
97+ wherever the schema-at-this-position is a binary/byte string.
98+
99+ The original ``value`` is never mutated. Containers are only
100+ copied when a descendant actually changes, so the unchanged
101+ fast path returns ``value`` itself -- callers can use object
102+ identity to detect a no-op.
103+
104+ Recursion is driven by the schema, not by introspecting the
105+ value: a ``dict`` is only descended when the schema declares
106+ ``properties``/``additionalProperties``, a ``list`` only when
107+ it declares ``items``, and composition (``oneOf``/``anyOf``/
108+ ``allOf``) is descended unconditionally because that's where
109+ a multipart binary branch typically lives.
110+ """
111+ if self ._accepts_binary_string (value ):
112+ return self ._decode_binary_value (value )
113+
114+ if isinstance (value , dict ):
115+ normalized = self ._normalize_mapping_for_validation (value )
116+ elif isinstance (value , list ) and "items" in self .schema :
117+ normalized = self ._normalize_array_for_validation (value )
118+ else :
119+ normalized = value
120+
121+ # Composition keywords are where the binary branch actually
122+ # lives in real specs (a multipart oneOf with a file branch and
123+ # a non-file branch, for example). We apply each sub-schema's
124+ # normalization in turn -- idempotent because a sub-schema that
125+ # doesn't touch a position returns the same object, and once a
126+ # bytes value has been decoded to ``str`` no other sub-schema
127+ # treats it as binary.
128+ for keyword in ("oneOf" , "anyOf" , "allOf" ):
129+ if keyword not in self .schema :
130+ continue
131+ for subschema in self .schema / keyword :
132+ normalized = self .evolve (
133+ subschema
134+ )._normalize_for_validation (normalized )
135+
136+ return normalized
137+
138+ def _normalize_mapping_for_validation (
139+ self , value : dict [str , Any ]
140+ ) -> dict [str , Any ]:
141+ normalized : dict [str , Any ] = value
142+
143+ if "properties" in self .schema :
144+ for prop_name , prop_schema in (self .schema / "properties" ).items ():
145+ if not isinstance (prop_name , str ) or prop_name not in value :
146+ continue
147+ prop_validator = self .evolve (prop_schema )
148+ new_value = prop_validator ._normalize_for_validation (
149+ value [prop_name ]
150+ )
151+ if new_value is value [prop_name ]:
152+ continue
153+ if normalized is value :
154+ normalized = dict (value )
155+ normalized [prop_name ] = new_value
156+
157+ additional = self .schema .get ("additionalProperties" , True )
158+ if additional in (True , False ):
159+ return normalized
160+
161+ property_names : set [str ] = set ()
162+ if "properties" in self .schema :
163+ property_names = {
164+ name
165+ for name in (self .schema / "properties" ).keys ()
166+ if isinstance (name , str )
167+ }
168+ additional_validator = self .evolve (
169+ self .schema / "additionalProperties"
170+ )
171+ for prop_name , prop_value in value .items ():
172+ if prop_name in property_names :
173+ continue
174+ new_value = additional_validator ._normalize_for_validation (
175+ prop_value
176+ )
177+ if new_value is prop_value :
178+ continue
179+ if normalized is value :
180+ normalized = dict (value )
181+ normalized [prop_name ] = new_value
182+
183+ return normalized
184+
185+ def _normalize_array_for_validation (
186+ self , value : list [Any ]
187+ ) -> list [Any ]:
188+ items_validator = self .evolve (self .schema / "items" )
189+ normalized : Optional [list [Any ]] = None
190+ for idx , item in enumerate (value ):
191+ new_item = items_validator ._normalize_for_validation (item )
192+ if new_item is item :
193+ continue
194+ if normalized is None :
195+ normalized = list (value )
196+ normalized [idx ] = new_item
197+ if normalized is None :
198+ return value
199+ return normalized
200+
50201 # Cache the recursive "does this schema benefit from a ValidationState?"
51202 # check, keyed on the SchemaPath. SchemaPath is hashed by content, so
52203 # two SchemaPaths pointing at the same spec location share a cache
@@ -267,6 +418,14 @@ def get_primitive_type(self, value: Any) -> Optional[str]:
267418 schema_types = sorted (self .validator .TYPE_CHECKER ._type_checkers )
268419 assert isinstance (schema_types , list )
269420 for schema_type in schema_types :
421+ if schema_type == "string" and self ._accepts_binary_string (
422+ value
423+ ):
424+ # Bytes value, binary/byte format, ``string`` is in the
425+ # declared type list: treat it as string without asking
426+ # jsonschema's type checker (which doesn't know about
427+ # OpenAPI's binary convention).
428+ return "string"
270429 result = self .type_validator (value , type_override = schema_type )
271430 if not result :
272431 continue
0 commit comments