Skip to content

Commit d599ad7

Browse files
committed
Fix multipart binary composed-schema matching
1 parent 850789a commit d599ad7

4 files changed

Lines changed: 484 additions & 21 deletions

File tree

openapi_core/validation/schemas/validators.py

Lines changed: 160 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
from openapi_core.validation.schemas.exceptions import InvalidSchemaValue
2323
from openapi_core.validation.schemas.exceptions import ValidateError
2424

25+
# OpenAPI ``format`` values whose *type: string* schemas are permitted to
26+
# carry a raw ``bytes`` payload end-to-end -- ``binary`` for opaque file
27+
# bodies (multipart/form-data, application/octet-stream) and ``byte`` for
28+
# base64 strings that callers may still hand in as ``bytes``.
29+
_BINARY_STRING_FORMATS = frozenset({"binary", "byte"})
30+
2531
if TYPE_CHECKING:
2632
from openapi_core.casting.schemas.casters import SchemaCaster
2733

@@ -41,12 +47,157 @@ def __contains__(self, schema_format: str) -> bool:
4147
return schema_format in self.validator.format_checker.checkers
4248

4349
def validate(self, value: Any) -> None:
    """Validate ``value`` against this validator's schema.

    ``bytes`` found at positions whose schema is a ``binary``/``byte``
    string are shown to jsonschema as decoded text, because jsonschema
    only accepts text for ``string``. The caller's ``value`` itself is
    what gets attached to the raised error.

    Raises:
        InvalidSchemaValue: when jsonschema reports at least one error;
            it carries the original ``value``, the schema's ``type``
            (read with ``"any"`` passed as the fallback) and the
            collected errors.
    """
    candidate = self._normalize_for_validation(value)
    found = tuple(self.validator.iter_errors(candidate))
    if not found:
        return
    declared_type = (self.schema / "type").read_str_or_list("any")
    raise InvalidSchemaValue(value, declared_type, schema_errors=found)
4962

63+
@staticmethod
64+
def _decode_binary_value(value: bytes) -> str:
65+
"""Decode raw ``bytes`` into the text view jsonschema expects.
66+
67+
``utf-8`` first because that's what the vast majority of byte
68+
bodies actually are; falling back to ASCII + ``surrogateescape``
69+
guarantees the call never raises for arbitrary binary payloads
70+
(a real file upload may contain any byte sequence).
71+
"""
72+
try:
73+
return value.decode("utf-8")
74+
except UnicodeDecodeError:
75+
return value.decode("ASCII", errors="surrogateescape")
76+
77+
def _accepts_binary_string(self, value: Any) -> bool:
78+
"""True when ``value`` is ``bytes`` and the schema at this
79+
position is a ``string`` whose ``format`` allows raw bytes.
80+
"""
81+
if not isinstance(value, bytes):
82+
return False
83+
schema_format = (self.schema / "format").read_str(None)
84+
if schema_format not in _BINARY_STRING_FORMATS:
85+
return False
86+
schema_types = (self.schema / "type").read_str_or_list(None)
87+
if schema_types is None:
88+
# No declared type: OAS 3.1 lets any value flow; treat the
89+
# binary/byte format as authoritative.
90+
return True
91+
if isinstance(schema_types, str):
92+
return schema_types == "string"
93+
return "string" in schema_types
94+
95+
def _normalize_for_validation(self, value: Any) -> Any:
    """Build the view of ``value`` that is handed to jsonschema.

    ``bytes`` are replaced by decoded text wherever the schema at that
    position is a binary/byte string. ``value`` is never mutated:
    containers are copied only when something inside them changes, so
    an untouched input comes back as the very same object and callers
    may use identity to detect a no-op.

    Traversal follows the schema rather than the value's shape:

    * dicts go through the mapping helper, which only descends via
      ``properties`` and a schema-valued ``additionalProperties``;
    * lists are descended only when the schema has ``items``;
    * every ``oneOf``/``anyOf``/``allOf`` branch is always applied,
      since that is where a multipart binary branch typically lives.
    """
    if self._accepts_binary_string(value):
        return self._decode_binary_value(value)

    result = value
    if isinstance(value, dict):
        result = self._normalize_mapping_for_validation(value)
    elif isinstance(value, list) and "items" in self.schema:
        result = self._normalize_array_for_validation(value)

    # Branches are applied one after another on the running result.
    # A branch that changes nothing returns the same object, and a
    # position already decoded to ``str`` is no longer ``bytes``, so
    # no later branch decodes it a second time.
    for keyword in ("oneOf", "anyOf", "allOf"):
        if keyword in self.schema:
            for branch in self.schema / keyword:
                branch_validator = self.evolve(branch)
                result = branch_validator._normalize_for_validation(result)

    return result
137+
138+
def _normalize_mapping_for_validation(
    self, value: dict[str, Any]
) -> dict[str, Any]:
    """Normalize the entries of a dict according to the object schema.

    Keys listed under ``properties`` are normalized with their own
    sub-schema. When ``additionalProperties`` is itself a schema, every
    remaining key is normalized with that schema. ``value`` is copied
    (shallowly) the first time an entry actually changes; otherwise the
    same dict object is returned.
    """
    result: dict[str, Any] = value

    def _store(key: str, replacement: Any) -> None:
        # Copy-on-write: duplicate the input only on the first change.
        nonlocal result
        if result is value:
            result = dict(value)
        result[key] = replacement

    if "properties" in self.schema:
        for name, subschema in (self.schema / "properties").items():
            if not (isinstance(name, str) and name in value):
                continue
            current = value[name]
            replacement = self.evolve(subschema)._normalize_for_validation(
                current
            )
            if replacement is not current:
                _store(name, replacement)

    additional = self.schema.get("additionalProperties", True)
    if additional in (True, False):
        # A boolean carries no sub-schema to normalize against.
        return result

    declared: set[str] = set()
    if "properties" in self.schema:
        declared = {
            name
            for name in (self.schema / "properties").keys()
            if isinstance(name, str)
        }

    extras_validator = self.evolve(self.schema / "additionalProperties")
    for name, current in value.items():
        if name in declared:
            continue
        replacement = extras_validator._normalize_for_validation(current)
        if replacement is not current:
            _store(name, replacement)

    return result
184+
185+
def _normalize_array_for_validation(
    self, value: list[Any]
) -> list[Any]:
    """Normalize every element of a list with the ``items`` sub-schema.

    The input list is copied the first time an element changes; if no
    element changes, ``value`` itself is returned.
    """
    element_validator = self.evolve(self.schema / "items")
    result = value
    for position, element in enumerate(value):
        replacement = element_validator._normalize_for_validation(element)
        if replacement is element:
            continue
        if result is value:
            # First change seen: switch to a private copy.
            result = list(value)
        result[position] = replacement
    return result
200+
50201
# Cache the recursive "does this schema benefit from a ValidationState?"
51202
# check, keyed on the SchemaPath. SchemaPath is hashed by content, so
52203
# two SchemaPaths pointing at the same spec location share a cache
@@ -267,6 +418,14 @@ def get_primitive_type(self, value: Any) -> Optional[str]:
267418
schema_types = sorted(self.validator.TYPE_CHECKER._type_checkers)
268419
assert isinstance(schema_types, list)
269420
for schema_type in schema_types:
421+
if schema_type == "string" and self._accepts_binary_string(
422+
value
423+
):
424+
# Bytes value, binary/byte format, ``string`` is in the
425+
# declared type list: treat it as string without asking
426+
# jsonschema's type checker (which doesn't know about
427+
# OpenAPI's binary convention).
428+
return "string"
270429
result = self.type_validator(value, type_override=schema_type)
271430
if not result:
272431
continue

tests/integration/unmarshalling/test_request_unmarshaller.py

Lines changed: 151 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import json
22
from base64 import b64encode
3-
from email.generator import _make_boundary
43

54
import pytest
65

@@ -469,16 +468,14 @@ def test_request_body_with_object_default(self):
469468
assert result.errors == []
470469
assert result.body == {"tags": []}
471470

472-
@pytest.mark.xfail(
473-
reason=(
474-
"multipart composed-schema branch selection is not binary-aware"
475-
),
476-
strict=True,
477-
)
478471
def test_request_body_multipart_oneof_binary_field(self):
479472
from openapi_core import OpenAPI
480473

481-
boundary = _make_boundary()
474+
# email.generator._make_boundary() returns strings like
475+
# ``===============1234==`` whose ``=`` chars trip the mimetype
476+
# parameter parser. That's a separate bug; here we just want a
477+
# legal boundary that round-trips the binary oneOf branch.
478+
boundary = "openapicoreboundary1234567890"
482479
spec = OpenAPI.from_dict(
483480
{
484481
"openapi": "3.1.0",
@@ -545,6 +542,152 @@ def test_request_body_multipart_oneof_binary_field(self):
545542
assert result.errors == []
546543
assert result.body == {"file": b"\xff\xfe"}
547544

545+
def test_request_body_multipart_anyof_binary_field(self):
    """A posted file is accepted by an ``anyOf`` with a binary branch.

    The schema offers a text-only branch (``note``) and a binary branch
    (``blob``); the request carries only ``blob`` with non-UTF-8 bytes,
    and the unmarshalled body must hand those bytes back unchanged.
    """
    from openapi_core import OpenAPI

    boundary = "openapicoreboundary1234567890"
    payload = b"\x00\x01\x02binary\xff"

    note_branch = {
        "type": "object",
        "properties": {"note": {"type": "string"}},
        "required": ["note"],
    }
    blob_branch = {
        "type": "object",
        "properties": {
            "blob": {"type": "string", "format": "binary"},
        },
        "required": ["blob"],
    }
    request_body = {
        "required": True,
        "content": {
            "multipart/form-data": {
                "schema": {"anyOf": [note_branch, blob_branch]},
            },
        },
    }
    spec = OpenAPI.from_dict(
        {
            "openapi": "3.1.0",
            "info": {"version": "0", "title": "test"},
            "paths": {
                "/test": {
                    "post": {
                        "requestBody": request_body,
                        "responses": {"200": {"description": ""}},
                    },
                },
            },
        }
    )

    part_head = (
        f"--{boundary}\n"
        "Content-Type: application/octet-stream\n"
        "MIME-Version: 1.0\n"
        'Content-Disposition: form-data; name="blob"\n\n'
    )
    closing = f"\n--{boundary}--\n"
    data = b"".join(
        [part_head.encode("ascii"), payload, closing.encode("ascii")]
    )
    request = MockRequest(
        "http://localhost",
        "post",
        "/test",
        content_type=f"multipart/form-data; boundary={boundary}",
        data=data,
    )

    result = spec.unmarshal_request(request)

    assert result.errors == []
    assert result.body == {"blob": payload}
616+
617+
def test_request_body_multipart_allof_binary_field(self):
    """A text part plus a file part satisfy an ``allOf`` schema.

    Every ``allOf`` branch has to validate, so the binary normalization
    must be visible to all of them: one branch requires the text field
    ``label``, the other the binary field ``file``.
    """
    from openapi_core import OpenAPI

    boundary = "openapicoreboundary1234567890"
    payload = b"\xff\xfe"

    label_branch = {
        "type": "object",
        "properties": {"label": {"type": "string"}},
        "required": ["label"],
    }
    file_branch = {
        "type": "object",
        "properties": {
            "file": {"type": "string", "format": "binary"},
        },
        "required": ["file"],
    }
    request_body = {
        "required": True,
        "content": {
            "multipart/form-data": {
                "schema": {"allOf": [label_branch, file_branch]},
            },
        },
    }
    spec = OpenAPI.from_dict(
        {
            "openapi": "3.1.0",
            "info": {"version": "0", "title": "test"},
            "paths": {
                "/test": {
                    "post": {
                        "requestBody": request_body,
                        "responses": {"200": {"description": ""}},
                    },
                },
            },
        }
    )

    parts_head = (
        f"--{boundary}\n"
        'Content-Disposition: form-data; name="label"\n\n'
        "report"
        f"\n--{boundary}\n"
        "Content-Type: application/octet-stream\n"
        'Content-Disposition: form-data; name="file"\n\n'
    )
    closing = f"\n--{boundary}--\n"
    data = b"".join(
        [parts_head.encode("ascii"), payload, closing.encode("ascii")]
    )
    request = MockRequest(
        "http://localhost",
        "post",
        "/test",
        content_type=f"multipart/form-data; boundary={boundary}",
        data=data,
    )

    result = spec.unmarshal_request(request)

    assert result.errors == []
    assert result.body == {"label": "report", "file": payload}
690+
548691
def test_post_pets_validates_request_schema_once(
549692
self, request_unmarshaller
550693
):

tests/unit/deserializing/test_media_types_deserializers.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -657,12 +657,6 @@ def test_urlencoded_form_with_array_default(self, deserializer_factory):
657657

658658
assert result == {"tags": []}
659659

660-
@pytest.mark.xfail(
661-
reason=(
662-
"multipart composed-schema branch selection is not binary-aware"
663-
),
664-
strict=True,
665-
)
666660
def test_multipart_oneof_binary_field(self, spec, deserializer_factory):
667661
mimetype = "multipart/form-data"
668662
schema_dict = {
@@ -757,12 +751,6 @@ def test_multipart_oneof_string_field(self, spec, deserializer_factory):
757751
"fieldA": "value",
758752
}
759753

760-
@pytest.mark.xfail(
761-
reason=(
762-
"multipart composed-schema branch selection is not binary-aware"
763-
),
764-
strict=True,
765-
)
766754
def test_multipart_anyof_binary_field(self, spec, deserializer_factory):
767755
mimetype = "multipart/form-data"
768756
schema_dict = {

0 commit comments

Comments
 (0)