Skip to content

Commit 2294dfc

Browse files
okhsunrog authored and 0x501D committed
feat: add configurable decimal serialization format for JSON writer
Add `DecimalFormat` enum with `Number` and `String` variants to control how decimal values are serialized in JSON output. The default `Number` format renders decimals as JSON numbers (e.g., 12.34), while the `String` format renders them as quoted strings (e.g., "12.34").

Changes:
- Add `DecimalFormat` enum to arrow-cast with `Number` (default) and `String` variants
- Add `decimal_format` field to `EncoderOptions` and `FormatOptions`
- Implement decimal formatting in `DisplayIndexState` for Decimal128/256
- Add `WriterBuilder::with_decimal_format()` configuration method
- Add tests for decimal arrays, lists, and dictionaries
- Fix lifetime elision warnings in `BitChunks` methods
1 parent 7ef69f7 commit 2294dfc

File tree

5 files changed

+197
-34
lines changed

5 files changed

+197
-34
lines changed

arrow-buffer/src/buffer/boolean.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ impl BooleanBuffer {
9292
/// Returns a `BitChunks` instance which can be used to iterate over
9393
/// this buffer's bits in `u64` chunks
9494
#[inline]
95-
pub fn bit_chunks(&self) -> BitChunks {
95+
pub fn bit_chunks(&self) -> BitChunks<'_> {
9696
BitChunks::new(self.values(), self.offset, self.len)
9797
}
9898

arrow-buffer/src/buffer/immutable.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ impl Buffer {
309309
/// Returns a `BitChunks` instance which can be used to iterate over this buffers bits
310310
/// in larger chunks and starting at arbitrary bit offsets.
311311
/// Note that both `offset` and `length` are measured in bits.
312-
pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks {
312+
pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks<'_> {
313313
BitChunks::new(self.as_slice(), offset, len)
314314
}
315315

arrow-cast/src/display.rs

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,17 @@ use lexical_core::FormattedSize;
3838

3939
type TimeFormat<'a> = Option<&'a str>;
4040

41+
/// Format for displaying decimals
42+
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq, Hash)]
43+
#[non_exhaustive]
44+
pub enum DecimalFormat {
45+
/// Render decimals as JSON numbers, e.g. 12.34
46+
#[default]
47+
Number,
48+
/// Render decimals as JSON strings, e.g. "12.34"
49+
String,
50+
}
51+
4152
/// Format for displaying durations
4253
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
4354
#[non_exhaustive]
@@ -72,6 +83,8 @@ pub struct FormatOptions<'a> {
7283
time_format: TimeFormat<'a>,
7384
/// Duration format
7485
duration_format: DurationFormat,
86+
/// Decimal rendering format
87+
decimal_format: DecimalFormat,
7588
}
7689

7790
impl Default for FormatOptions<'_> {
@@ -92,6 +105,7 @@ impl<'a> FormatOptions<'a> {
92105
timestamp_tz_format: None,
93106
time_format: None,
94107
duration_format: DurationFormat::ISO8601,
108+
decimal_format: DecimalFormat::Number,
95109
}
96110
}
97111

@@ -158,6 +172,14 @@ impl<'a> FormatOptions<'a> {
158172
..self
159173
}
160174
}
175+
176+
/// Set how decimal values should be formatted
177+
pub const fn with_decimal_format(self, decimal_format: DecimalFormat) -> Self {
178+
Self {
179+
decimal_format,
180+
..self
181+
}
182+
}
161183
}
162184

163185
/// Implements [`Display`] for a specific array value
@@ -460,14 +482,24 @@ impl DisplayIndex for &PrimitiveArray<Float16Type> {
460482
macro_rules! decimal_display {
461483
($($t:ty),+) => {
462484
$(impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> {
463-
type State = (u8, i8);
485+
type State = (u8, i8, DecimalFormat);
464486

465-
fn prepare(&self, _options: &FormatOptions<'a>) -> Result<Self::State, ArrowError> {
466-
Ok((self.precision(), self.scale()))
487+
fn prepare(&self, options: &FormatOptions<'a>) -> Result<Self::State, ArrowError> {
488+
Ok((self.precision(), self.scale(), options.decimal_format))
467489
}
468490

469491
fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult {
470-
write!(f, "{}", <$t>::format_decimal(self.values()[idx], s.0, s.1))?;
492+
let formatted = <$t>::format_decimal(self.values()[idx], s.0, s.1);
493+
match s.2 {
494+
DecimalFormat::String => {
495+
// Format as quoted string
496+
write!(f, "\"{}\"", formatted)?;
497+
}
498+
DecimalFormat::Number => {
499+
// Format as number
500+
write!(f, "{}", formatted)?;
501+
}
502+
}
471503
Ok(())
472504
}
473505
})+

arrow-json/src/writer/encoder.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use arrow_array::cast::AsArray;
1919
use arrow_array::types::*;
2020
use arrow_array::*;
2121
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer};
22-
use arrow_cast::display::{ArrayFormatter, FormatOptions};
22+
use arrow_cast::display::{ArrayFormatter, DecimalFormat, FormatOptions};
2323
use arrow_schema::{ArrowError, DataType, FieldRef};
2424
use half::f16;
2525
use lexical_core::FormattedSize;
@@ -29,6 +29,7 @@ use std::io::Write;
2929
#[derive(Debug, Clone, Default)]
3030
pub struct EncoderOptions {
3131
pub explicit_nulls: bool,
32+
pub decimal_format: DecimalFormat,
3233
}
3334

3435
/// A trait to format array values as JSON values
@@ -139,8 +140,10 @@ fn make_encoder_impl<'a>(
139140
(Box::new(encoder) as _, array.nulls().cloned())
140141
}
141142
DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => {
142-
let options = FormatOptions::new().with_display_error(true);
143-
let formatter = ArrayFormatter::try_new(array, &options)?;
143+
let format_options = FormatOptions::new()
144+
.with_display_error(true)
145+
.with_decimal_format(options.decimal_format);
146+
let formatter = ArrayFormatter::try_new(array, &format_options)?;
144147
(Box::new(RawArrayFormatter(formatter)) as _, array.nulls().cloned())
145148
}
146149
d => match d.is_temporal() {

arrow-json/src/writer/mod.rs

Lines changed: 153 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ use std::{fmt::Debug, io::Write};
111111
use arrow_array::*;
112112
use arrow_schema::*;
113113

114+
use arrow_cast::display::DecimalFormat;
114115
use encoder::{make_encoder, EncoderOptions};
115116

116117
/// This trait defines how to format a sequence of JSON objects to a
@@ -227,6 +228,11 @@ impl WriterBuilder {
227228
self.0.explicit_nulls
228229
}
229230

231+
/// Returns the decimal format for this writer
232+
pub fn decimal_format(&self) -> DecimalFormat {
233+
self.0.decimal_format
234+
}
235+
230236
/// Set whether to keep keys with null values, or to omit writing them.
231237
///
232238
/// For example, with [`LineDelimited`] format:
@@ -253,6 +259,12 @@ impl WriterBuilder {
253259
self
254260
}
255261

262+
/// Set how decimals should be formatted in JSON output.
263+
pub fn with_decimal_format(mut self, decimal_format: DecimalFormat) -> Self {
264+
self.0.decimal_format = decimal_format;
265+
self
266+
}
267+
256268
/// Create a new `Writer` with specified `JsonFormat` and builder options.
257269
pub fn build<W, F>(self, writer: W) -> Writer<W, F>
258270
where
@@ -432,6 +444,29 @@ mod tests {
432444
assert_eq!(expected, actual);
433445
}
434446

447+
/// Helper to assert decimal output with `Number` and `String` decimal formats
448+
fn assert_decimal_outputs(
449+
batch: &RecordBatch,
450+
expected_default: &str,
451+
expected_decimal_as_string: &str,
452+
) {
453+
let mut buf = Vec::new();
454+
{
455+
let mut writer = LineDelimitedWriter::new(&mut buf);
456+
writer.write_batches(&[batch]).unwrap();
457+
}
458+
assert_json_eq(&buf, expected_default);
459+
460+
let mut buf = Vec::new();
461+
{
462+
let mut writer = WriterBuilder::new()
463+
.with_decimal_format(DecimalFormat::String)
464+
.build::<_, LineDelimited>(&mut buf);
465+
writer.write_batches(&[batch]).unwrap();
466+
}
467+
assert_json_eq(&buf, expected_decimal_as_string);
468+
}
469+
435470
#[test]
436471
fn write_simple_rows() {
437472
let schema = Schema::new(vec![
@@ -1887,17 +1922,15 @@ mod tests {
18871922
let schema = Schema::new(vec![field]);
18881923
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap();
18891924

1890-
let mut buf = Vec::new();
1891-
{
1892-
let mut writer = LineDelimitedWriter::new(&mut buf);
1893-
writer.write_batches(&[&batch]).unwrap();
1894-
}
1895-
1896-
assert_json_eq(
1897-
&buf,
1925+
assert_decimal_outputs(
1926+
&batch,
18981927
r#"{"decimal":12.34}
18991928
{"decimal":56.78}
19001929
{"decimal":90.12}
1930+
"#,
1931+
r#"{"decimal":"12.34"}
1932+
{"decimal":"56.78"}
1933+
{"decimal":"90.12"}
19011934
"#,
19021935
);
19031936
}
@@ -1914,18 +1947,15 @@ mod tests {
19141947
let field = Arc::new(Field::new("decimal", array.data_type().clone(), true));
19151948
let schema = Schema::new(vec![field]);
19161949
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap();
1917-
1918-
let mut buf = Vec::new();
1919-
{
1920-
let mut writer = LineDelimitedWriter::new(&mut buf);
1921-
writer.write_batches(&[&batch]).unwrap();
1922-
}
1923-
1924-
assert_json_eq(
1925-
&buf,
1950+
assert_decimal_outputs(
1951+
&batch,
19261952
r#"{"decimal":12.3400}
19271953
{"decimal":56.7800}
19281954
{"decimal":90.1200}
1955+
"#,
1956+
r#"{"decimal":"12.3400"}
1957+
{"decimal":"56.7800"}
1958+
{"decimal":"90.1200"}
19291959
"#,
19301960
);
19311961
}
@@ -1938,18 +1968,116 @@ mod tests {
19381968
let field = Arc::new(Field::new("decimal", array.data_type().clone(), true));
19391969
let schema = Schema::new(vec![field]);
19401970
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap();
1971+
assert_decimal_outputs(
1972+
&batch,
1973+
r#"{"decimal":12.34}
1974+
{}
1975+
{"decimal":56.78}
1976+
"#,
1977+
r#"{"decimal":"12.34"}
1978+
{}
1979+
{"decimal":"56.78"}
1980+
"#,
1981+
);
1982+
}
19411983

1942-
let mut buf = Vec::new();
1943-
{
1944-
let mut writer = LineDelimitedWriter::new(&mut buf);
1945-
writer.write_batches(&[&batch]).unwrap();
1984+
#[test]
1985+
fn test_decimal128_list_encoder() {
1986+
let decimal_type = DataType::Decimal128(10, 2);
1987+
let item_field = FieldRef::new(Field::new("item", decimal_type.clone(), true));
1988+
let schema = Schema::new(vec![Field::new("list", DataType::List(item_field), true)]);
1989+
1990+
let values_builder = Decimal128Builder::new().with_data_type(decimal_type.clone());
1991+
let mut list_builder = ListBuilder::new(values_builder);
1992+
let rows = [Some(vec![Some(1234), None]), Some(vec![Some(5678)])];
1993+
1994+
for row in rows {
1995+
match row {
1996+
Some(values) => {
1997+
for value in values {
1998+
match value {
1999+
Some(v) => list_builder.values().append_value(v),
2000+
None => list_builder.values().append_null(),
2001+
}
2002+
}
2003+
list_builder.append(true);
2004+
}
2005+
None => list_builder.append(false),
2006+
}
19462007
}
19472008

1948-
assert_json_eq(
1949-
&buf,
1950-
r#"{"decimal":12.34}
2009+
let array = Arc::new(list_builder.finish()) as ArrayRef;
2010+
let batch = RecordBatch::try_new(Arc::new(schema), vec![array]).unwrap();
2011+
2012+
assert_decimal_outputs(
2013+
&batch,
2014+
r#"{"list":[12.34,null]}
2015+
{"list":[56.78]}
2016+
"#,
2017+
r#"{"list":["12.34",null]}
2018+
{"list":["56.78"]}
2019+
"#,
2020+
);
2021+
}
2022+
2023+
#[test]
2024+
fn test_decimal128_dictionary_encoder() {
2025+
let values = Arc::new(
2026+
Decimal128Array::from_iter_values([1234, 5678])
2027+
.with_precision_and_scale(10, 2)
2028+
.unwrap(),
2029+
);
2030+
let keys = Int8Array::from(vec![Some(0), None, Some(1)]);
2031+
let dict = DictionaryArray::new(keys, values.clone());
2032+
2033+
let schema = Schema::new(vec![Field::new(
2034+
"dict",
2035+
DataType::Dictionary(DataType::Int8.into(), DataType::Decimal128(10, 2).into()),
2036+
true,
2037+
)]);
2038+
2039+
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(dict)]).unwrap();
2040+
2041+
assert_decimal_outputs(
2042+
&batch,
2043+
r#"{"dict":12.34}
19512044
{}
1952-
{"decimal":56.78}
2045+
{"dict":56.78}
2046+
"#,
2047+
r#"{"dict":"12.34"}
2048+
{}
2049+
{"dict":"56.78"}
2050+
"#,
2051+
);
2052+
}
2053+
2054+
#[test]
2055+
fn test_decimal256_dictionary_encoder() {
2056+
let values = Arc::new(
2057+
Decimal256Array::from_iter_values([i256::from(1234), i256::from(5678)])
2058+
.with_precision_and_scale(10, 2)
2059+
.unwrap(),
2060+
);
2061+
let keys = Int8Array::from(vec![Some(0), None, Some(1)]);
2062+
let dict = DictionaryArray::new(keys, values.clone());
2063+
2064+
let schema = Schema::new(vec![Field::new(
2065+
"dict",
2066+
DataType::Dictionary(DataType::Int8.into(), DataType::Decimal256(10, 2).into()),
2067+
true,
2068+
)]);
2069+
2070+
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(dict)]).unwrap();
2071+
2072+
assert_decimal_outputs(
2073+
&batch,
2074+
r#"{"dict":12.34}
2075+
{}
2076+
{"dict":56.78}
2077+
"#,
2078+
r#"{"dict":"12.34"}
2079+
{}
2080+
{"dict":"56.78"}
19532081
"#,
19542082
);
19552083
}

0 commit comments

Comments
 (0)