Skip to content

Commit 22e23cf

Browse files
authored
Fix propagation of CSV options through protos (#245)
* Fix sink output schema being passed in to `FileSinkExec` instead of sink input schema
* Expose the `double_quote` CSV option, and ensure all CSV options are propagated through logical/physical plans

---------

Co-authored-by: svranesevic <[email protected]>
1 parent 71393c5 commit 22e23cf

File tree

9 files changed

+131
-8
lines changed

9 files changed

+131
-8
lines changed

datafusion/common/src/config.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1520,6 +1520,7 @@ config_namespace! {
15201520
pub delimiter: u8, default = b','
15211521
pub quote: u8, default = b'"'
15221522
pub escape: Option<u8>, default = None
1523+
pub double_quote: bool, default = true
15231524
pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
15241525
pub schema_infer_max_rec: usize, default = 100
15251526
pub date_format: Option<String>, default = None

datafusion/common/src/file_options/csv_writer.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,13 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
5151
fn try_from(value: &CsvOptions) -> Result<Self> {
5252
let mut builder = WriterBuilder::default()
5353
.with_header(value.has_header)
54-
.with_delimiter(value.delimiter);
54+
.with_quote(value.quote)
55+
.with_delimiter(value.delimiter)
56+
.with_double_quote(value.double_quote);
5557

58+
if let Some(v) = &value.escape {
59+
builder = builder.with_escape(*v)
60+
}
5661
if let Some(v) = &value.date_format {
5762
builder = builder.with_date_format(v.into())
5863
}

datafusion/proto/proto/datafusion.proto

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,6 +1255,12 @@ message CsvWriterOptions {
12551255
string time_format = 7;
12561256
// Optional value to represent null
12571257
string null_value = 8;
1258+
// Optional quote. Defaults to `b'"'`
1259+
string quote = 9;
1260+
// Optional escape. Defaults to `'\\'`
1261+
string escape = 10;
1262+
// Optional flag: when set, a quote character inside a field is written as two quotes instead of being escaped. Defaults to `true`
1263+
bool double_quote = 11;
12581264
}
12591265

12601266
// Options controlling CSV format
@@ -1271,6 +1277,7 @@ message CsvOptions {
12711277
string timestamp_tz_format = 10; // Optional timestamp with timezone format
12721278
string time_format = 11; // Optional time format
12731279
string null_value = 12; // Optional representation of null value
1280+
bool double_quote = 13; // Indicates whether to use double quotes instead of escaping
12741281
}
12751282

12761283
// Options controlling CSV format

datafusion/proto/src/generated/pbjson.rs

Lines changed: 70 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/proto/src/generated/prost.rs

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/proto/src/logical_plan/mod.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1626,6 +1626,9 @@ pub(crate) fn csv_writer_options_to_proto(
16261626
timestamp_format: csv_options.timestamp_format().unwrap_or("").to_owned(),
16271627
time_format: csv_options.time_format().unwrap_or("").to_owned(),
16281628
null_value: csv_options.null().to_owned(),
1629+
quote: (csv_options.quote() as char).to_string(),
1630+
escape: (csv_options.escape() as char).to_string(),
1631+
double_quote: csv_options.double_quote(),
16291632
}
16301633
}
16311634

@@ -1644,11 +1647,34 @@ pub(crate) fn csv_writer_options_from_proto(
16441647
return Err(proto_error("Error parsing CSV Delimiter"));
16451648
}
16461649
}
1650+
if !writer_options.quote.is_empty() {
1651+
if let Some(quote) = writer_options.quote.chars().next() {
1652+
if quote.is_ascii() {
1653+
builder = builder.with_quote(quote as u8);
1654+
} else {
1655+
return Err(proto_error("CSV quote is not ASCII"));
1656+
}
1657+
} else {
1658+
return Err(proto_error("Error parsing CSV quote"));
1659+
}
1660+
}
1661+
if !writer_options.escape.is_empty() {
1662+
if let Some(escape) = writer_options.escape.chars().next() {
1663+
if escape.is_ascii() {
1664+
builder = builder.with_escape(escape as u8);
1665+
} else {
1666+
return Err(proto_error("CSV escape is not ASCII"));
1667+
}
1668+
} else {
1669+
return Err(proto_error("Error parsing CSV escape"));
1670+
}
1671+
}
16471672
Ok(builder
16481673
.with_header(writer_options.has_header)
16491674
.with_date_format(writer_options.date_format.clone())
16501675
.with_datetime_format(writer_options.datetime_format.clone())
16511676
.with_timestamp_format(writer_options.timestamp_format.clone())
16521677
.with_time_format(writer_options.time_format.clone())
1653-
.with_null(writer_options.null_value.clone()))
1678+
.with_null(writer_options.null_value.clone())
1679+
.with_double_quote(writer_options.double_quote))
16541680
}

datafusion/proto/src/physical_plan/from_proto.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -872,6 +872,7 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {
872872
delimiter: proto_opts.delimiter[0],
873873
quote: proto_opts.quote[0],
874874
escape: proto_opts.escape.first().copied(),
875+
double_quote: proto_opts.double_quote,
875876
compression: proto_opts.compression().into(),
876877
schema_infer_max_rec: proto_opts.schema_infer_max_rec as usize,
877878
date_format: (!proto_opts.date_format.is_empty())

datafusion/proto/src/physical_plan/mod.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,7 +1020,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
10201020
.as_ref()
10211021
.ok_or_else(|| proto_error("Missing required field in protobuf"))?
10221022
.try_into()?;
1023-
let sink_schema = convert_required!(sink.sink_schema)?;
1023+
let sink_schema = input.schema();
10241024
let sort_order = sink
10251025
.sort_order
10261026
.as_ref()
@@ -1037,7 +1037,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
10371037
Ok(Arc::new(FileSinkExec::new(
10381038
input,
10391039
Arc::new(data_sink),
1040-
Arc::new(sink_schema),
1040+
sink_schema,
10411041
sort_order,
10421042
)))
10431043
}
@@ -1050,7 +1050,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
10501050
.as_ref()
10511051
.ok_or_else(|| proto_error("Missing required field in protobuf"))?
10521052
.try_into()?;
1053-
let sink_schema = convert_required!(sink.sink_schema)?;
1053+
let sink_schema = input.schema();
10541054
let sort_order = sink
10551055
.sort_order
10561056
.as_ref()
@@ -1067,7 +1067,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
10671067
Ok(Arc::new(FileSinkExec::new(
10681068
input,
10691069
Arc::new(data_sink),
1070-
Arc::new(sink_schema),
1070+
sink_schema,
10711071
sort_order,
10721072
)))
10731073
}
@@ -1080,7 +1080,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
10801080
.as_ref()
10811081
.ok_or_else(|| proto_error("Missing required field in protobuf"))?
10821082
.try_into()?;
1083-
let sink_schema = convert_required!(sink.sink_schema)?;
1083+
let sink_schema = input.schema();
10841084
let sort_order = sink
10851085
.sort_order
10861086
.as_ref()
@@ -1097,7 +1097,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode {
10971097
Ok(Arc::new(FileSinkExec::new(
10981098
input,
10991099
Arc::new(data_sink),
1100-
Arc::new(sink_schema),
1100+
sink_schema,
11011101
sort_order,
11021102
)))
11031103
}

datafusion/proto/src/physical_plan/to_proto.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,7 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions {
10701070
timestamp_tz_format: opts.timestamp_tz_format.clone().unwrap_or_default(),
10711071
time_format: opts.time_format.clone().unwrap_or_default(),
10721072
null_value: opts.null_value.clone().unwrap_or_default(),
1073+
double_quote: opts.double_quote,
10731074
})
10741075
}
10751076
}

0 commit comments

Comments
 (0)