2222from openapi_core .validation .schemas .exceptions import InvalidSchemaValue
2323from openapi_core .validation .schemas .exceptions import ValidateError
2424
# ``format`` values under which a ``type: string`` schema may carry a raw
# ``bytes`` payload from end to end:
#   * ``binary`` -- opaque file bodies (multipart/form-data,
#     application/octet-stream);
#   * ``byte``   -- base64 strings that callers may still pass as ``bytes``.
_BINARY_STRING_FORMATS = frozenset(("binary", "byte"))
30+
2531if TYPE_CHECKING :
2632 from openapi_core .casting .schemas .casters import SchemaCaster
2733
@@ -41,12 +47,156 @@ def __contains__(self, schema_format: str) -> bool:
4147 return schema_format in self .validator .format_checker .checkers
4248
def validate(self, value: Any) -> None:
    """Validate ``value`` against this validator's schema.

    ``bytes`` are allowed to flow through ``string`` schemas whose
    ``format`` is ``binary`` or ``byte`` (file uploads, base64 blobs),
    but the underlying validator only checks ``string`` against text.
    It is therefore handed a decoded view of ``value``; the untouched
    original is what gets attached to the raised error.

    Raises:
        InvalidSchemaValue: if the underlying validator yields at least
            one error for the normalized view.
    """
    view = self._normalize_for_validation(value)
    errors = tuple(self.validator.iter_errors(view))
    if not errors:
        return

    # Report against the caller's original value, not the decoded view.
    schema_type = (self.schema / "type").read_str_or_list("any")
    raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
4962
@staticmethod
def _decode_binary_value(value: bytes) -> str:
    """Return a text view of raw ``bytes`` that never fails to decode.

    Strict ``utf-8`` is attempted first. If the payload is not valid
    UTF-8 (a real file upload may contain any byte sequence), the whole
    payload is decoded as ASCII with ``surrogateescape`` instead, which
    maps every non-ASCII byte to a lone surrogate rather than raising.
    """
    try:
        text = value.decode("utf-8")
    except UnicodeDecodeError:
        # Arbitrary binary: fall back to the decode that cannot raise.
        text = value.decode("ASCII", errors="surrogateescape")
    return text
76+
def _accepts_binary_string(self, value: Any) -> bool:
    """Tell whether ``value`` is ``bytes`` that this schema lets through.

    Returns ``True`` only when all of the following hold:

    * ``value`` is a ``bytes`` instance;
    * the schema's ``format`` is one of ``_BINARY_STRING_FORMATS``;
    * the schema's ``type`` is absent, is ``"string"``, or is a list
      containing ``"string"``.
    """
    if not isinstance(value, bytes):
        return False
    if (self.schema / "format").read_str(None) not in _BINARY_STRING_FORMATS:
        return False

    declared = (self.schema / "type").read_str_or_list(None)
    if declared is None:
        # No declared type: OAS 3.1 lets any value flow, so the
        # binary/byte format is treated as authoritative.
        return True
    return (
        declared == "string"
        if isinstance(declared, str)
        else "string" in declared
    )
94+
def _normalize_for_validation(self, value: Any) -> Any:
    """Build the view of ``value`` that is handed to the validator.

    Every ``bytes`` found at a position whose schema is a binary/byte
    string is replaced by its decoded text. ``value`` itself is never
    mutated, and containers are copied only when one of their
    descendants actually changes -- when nothing changes, the very same
    object is returned, so callers can detect a no-op by identity.

    The walk follows the schema rather than the value:

    * a ``dict`` is handed to the mapping helper, which consults
      ``properties`` / ``additionalProperties``;
    * a ``list`` is descended only when the schema declares ``items``;
    * ``oneOf`` / ``anyOf`` / ``allOf`` are always descended, because
      that is where a multipart binary branch typically lives.
    """
    if self._accepts_binary_string(value):
        return self._decode_binary_value(value)

    view: Any = value
    if isinstance(value, dict):
        view = self._normalize_mapping_for_validation(value)
    elif isinstance(value, list) and "items" in self.schema:
        view = self._normalize_array_for_validation(value)

    # Each sub-schema's normalization is applied in turn on top of the
    # previous result. This is idempotent: a sub-schema that does not
    # touch a position hands back the same object, and once bytes have
    # become ``str`` no other sub-schema treats them as binary.
    for keyword in ("oneOf", "anyOf", "allOf"):
        if keyword in self.schema:
            for branch in self.schema / keyword:
                view = self.evolve(branch)._normalize_for_validation(view)

    return view
138+
def _normalize_mapping_for_validation(
    self, value: dict[str, Any]
) -> dict[str, Any]:
    """Normalize the entries of a mapping according to the schema.

    Keys listed under ``properties`` are normalized with their own
    sub-schema. When ``additionalProperties`` is itself a schema (not
    ``True``/``False``), every remaining key is normalized with it.

    Returns ``value`` itself when no entry changed; otherwise a shallow
    ``dict`` copy with only the changed entries replaced.
    """
    # Replacements are gathered on the side so ``value`` is never
    # touched; every key recorded here already exists in ``value``.
    replacements: dict[str, Any] = {}
    declares_properties = "properties" in self.schema

    if declares_properties:
        for name, subschema in (self.schema / "properties").items():
            if isinstance(name, str) and name in value:
                current = value[name]
                updated = self.evolve(subschema)._normalize_for_validation(
                    current
                )
                if updated is not current:
                    replacements[name] = updated

    additional = self.schema.get("additionalProperties", True)
    if additional not in (True, False):
        # ``additionalProperties`` is a schema: apply it to every key
        # that ``properties`` does not already cover.
        declared_names: set[str] = set()
        if declares_properties:
            declared_names = {
                name
                for name in (self.schema / "properties").keys()
                if isinstance(name, str)
            }
        extra_validator = self.evolve(self.schema / "additionalProperties")
        for name, current in value.items():
            if name in declared_names:
                continue
            updated = extra_validator._normalize_for_validation(current)
            if updated is not current:
                replacements[name] = updated

    if not replacements:
        return value
    merged = dict(value)
    merged.update(replacements)
    return merged
185+
def _normalize_array_for_validation(self, value: list[Any]) -> list[Any]:
    """Normalize each element of a list with the schema's ``items``.

    Returns ``value`` itself when no element changed; otherwise a
    shallow ``list`` copy with only the changed elements replaced.
    """
    item_validator = self.evolve(self.schema / "items")
    result = value
    for position, element in enumerate(value):
        replacement = item_validator._normalize_for_validation(element)
        if replacement is element:
            continue
        if result is value:
            # First change seen: copy lazily so the input stays intact.
            result = list(value)
        result[position] = replacement
    return result
199+
50200 # Cache the recursive "does this schema benefit from a ValidationState?"
51201 # check, keyed on the SchemaPath. SchemaPath is hashed by content, so
52202 # two SchemaPaths pointing at the same spec location share a cache
@@ -267,6 +417,12 @@ def get_primitive_type(self, value: Any) -> Optional[str]:
267417 schema_types = sorted (self .validator .TYPE_CHECKER ._type_checkers )
268418 assert isinstance (schema_types , list )
269419 for schema_type in schema_types :
420+ if schema_type == "string" and self ._accepts_binary_string (value ):
421+ # Bytes value, binary/byte format, ``string`` is in the
422+ # declared type list: treat it as string without asking
423+ # jsonschema's type checker (which doesn't know about
424+ # OpenAPI's binary convention).
425+ return "string"
270426 result = self .type_validator (value , type_override = schema_type )
271427 if not result :
272428 continue
0 commit comments