Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 130 additions & 1 deletion parquet/src/arrow/schema/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,15 @@ fn decimal_256_type(scale: i32, precision: i32) -> Result<DataType> {
Ok(DataType::Decimal256(precision, scale))
}

fn check_decimal_length(type_length: i32) -> Result<()> {
if type_length < 1 || type_length > 32 {
return Err(ParquetError::General(format!(
"DECIMAL must be a Fixed Length Byte Array with length 1 to 32, got {type_length}"
)));
}
Ok(())
}

fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataType> {
match (info.logical_type_ref(), info.converted_type()) {
(None, ConvertedType::NONE) => Ok(DataType::Int32),
Expand Down Expand Up @@ -313,23 +322,30 @@ fn from_fixed_len_byte_array(
precision: i32,
type_length: i32,
) -> Result<DataType> {
// TODO: This should check the type length for the decimal and interval types
match (info.logical_type_ref(), info.converted_type()) {
(Some(LogicalType::Decimal { scale, precision }), _) => {
check_decimal_length(type_length)?;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be checking exact lengths? The only valid lengths in this case are 16 and 32 , right?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch on the wording! You're totally right—any byte length is valid here, not just 16 or 32.

As per the Parquet spec for fixed_len_byte_array, the precision is simply limited by the array size. As long as the precision fits within the byte length, it's completely valid.

That upper bound of 32 we were looking at is actually an Arrow constraint (since Decimal256 is 256 bits, which caps it at 32 bytes), rather than a limitation from Parquet itself. Checking for a range of 1..=32 makes perfect sense here, especially since we definitely see perfectly legitimate Parquet files out in the wild with byte lengths like 4 or 7.

// lengths 1..=16 map to Decimal128, 17..=32 to Decimal256
if type_length <= 16 {
decimal_128_type(*scale, *precision)
} else {
decimal_256_type(*scale, *precision)
}
}
(None, ConvertedType::DECIMAL) => {
check_decimal_length(type_length)?;
if type_length <= 16 {
decimal_128_type(scale, precision)
} else {
decimal_256_type(scale, precision)
}
}
(None, ConvertedType::INTERVAL) => {
if type_length != 12 {
return Err(ParquetError::General(format!(
"INTERVAL must be a Fixed Length Byte Array with length 12, got {type_length}"
)));
}
// There is currently no reliable way of determining which IntervalUnit
// to return. Thus without the original Arrow schema, the results
// would be incorrect if all 12 bytes of the interval are populated
Expand All @@ -348,3 +364,116 @@ fn from_fixed_len_byte_array(
_ => Ok(DataType::FixedSizeBinary(type_length)),
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::basic::Repetition;
use crate::schema::types::Type;

// The PrimitiveTypeBuilder rejects bad lengths at construction. To exercise
// the reader-side checks, build a valid type then overwrite its type_length,
// simulating a schema decoded from a file that wasn't produced via the builder.
fn with_type_length(ty: Type, type_length: i32) -> Type {
match ty {
Type::PrimitiveType {
basic_info,
physical_type,
precision,
scale,
..
} => Type::PrimitiveType {
basic_info,
physical_type,
type_length,
precision,
scale,
},
_ => unreachable!(),
}
}

fn flba_decimal_logical(type_length: i32) -> Type {
let valid = Type::primitive_type_builder("c", PhysicalType::FIXED_LEN_BYTE_ARRAY)
.with_repetition(Repetition::REQUIRED)
.with_logical_type(Some(LogicalType::Decimal {
precision: 5,
scale: 2,
}))
.with_length(16)
.with_precision(5)
.with_scale(2)
.build()
.unwrap();
with_type_length(valid, type_length)
}

fn flba_decimal_converted(type_length: i32) -> Type {
let valid = Type::primitive_type_builder("c", PhysicalType::FIXED_LEN_BYTE_ARRAY)
.with_repetition(Repetition::REQUIRED)
.with_converted_type(ConvertedType::DECIMAL)
.with_length(16)
.with_precision(5)
.with_scale(2)
.build()
.unwrap();
with_type_length(valid, type_length)
}

fn flba_interval(type_length: i32) -> Type {
let valid = Type::primitive_type_builder("c", PhysicalType::FIXED_LEN_BYTE_ARRAY)
.with_repetition(Repetition::REQUIRED)
.with_converted_type(ConvertedType::INTERVAL)
.with_length(12)
.build()
.unwrap();
with_type_length(valid, type_length)
}

fn assert_err_contains(ty: &Type, needle: &str) {
let err = convert_primitive(ty, None).expect_err("expected an error");
let msg = err.to_string();
assert!(msg.contains(needle), "expected {needle:?} in error: {msg}");
}

#[test]
fn decimal_logical_rejects_invalid_length() {
for bad in [-1, 0, 33] {
assert_err_contains(&flba_decimal_logical(bad), "DECIMAL");
}
}

#[test]
fn decimal_converted_rejects_invalid_length() {
for bad in [-1, 0, 33] {
assert_err_contains(&flba_decimal_converted(bad), "DECIMAL");
}
}

#[test]
fn decimal_accepts_valid_lengths() {
assert!(matches!(
convert_primitive(&flba_decimal_logical(16), None).unwrap(),
DataType::Decimal128(_, _)
));
assert!(matches!(
convert_primitive(&flba_decimal_logical(32), None).unwrap(),
DataType::Decimal256(_, _)
));
}

#[test]
fn interval_rejects_wrong_length() {
for bad in [0, 11, 13] {
assert_err_contains(&flba_interval(bad), "INTERVAL");
}
}

#[test]
fn interval_accepts_length_12() {
assert_eq!(
convert_primitive(&flba_interval(12), None).unwrap(),
DataType::Interval(IntervalUnit::DayTime)
);
}
}