Skip to content

Commit

Permalink
func: Limit number of columns in csv_extract (#23044)
Browse files Browse the repository at this point in the history
This PR limits the absolute number of columns that can be extracted with
`csv_extract` to 8192. Given the [maximum number of columns Postgres
supports is 1600](https://www.postgresql.org/docs/current/limits.html),
8192 seems like a good limit.

Currently we do limit the number of columns to just how much memory we
can allocate, but this leads to issues like
https://github.com/MaterializeInc/materialize/issues/22735, and
realistically we should restrict the number of columns much more than
just the number of column names we can fit in memory.

### Motivation

Fixes https://github.com/MaterializeInc/materialize/issues/22735

### Checklist

- [ ] This PR has adequate test coverage / QA involvement has been duly
considered.
- [ ] This PR has an associated up-to-date [design
doc](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/README.md),
is a design doc
([template](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/00000000_template.md)),
or is sufficiently small to not require a design.
  <!-- Reference the design in the description. -->
- [ ] If this PR evolves [an existing `$T ⇔ Proto$T`
mapping](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/command-and-response-binary-encoding.md)
(possibly in a backwards-incompatible way), then it is tagged with a
`T-proto` label.
- [ ] If this PR will require changes to cloud orchestration or tests,
there is a companion cloud PR to account for those changes that is
tagged with the release-blocker label
([example](MaterializeInc/cloud#5021)).
<!-- Ask in #team-cloud on Slack if you need help preparing the cloud
PR. -->
- [x] This PR includes the following [user-facing behavior
changes](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/guide-changes.md#what-changes-require-a-release-note):
  - N/a
  • Loading branch information
ParkMyCar authored Nov 13, 2023
1 parent 8f2feab commit 99c27e9
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 12 deletions.
2 changes: 1 addition & 1 deletion misc/python/materialize/sqlsmith.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@
"regex parse error",
"out of valid range",
'" does not exist', # role does not exist
"csv_extract number of columns too large",
"attempt to create relation with too many columns",
"target replica failed or was dropped", # expected on replica OoMs with #21587
"cannot materialize call to", # create materialized view on some internal views
"arrays must not contain null values", # aclexplode, mz_aclexplode
Expand Down
21 changes: 11 additions & 10 deletions src/sql/src/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3447,23 +3447,24 @@ pub static MZ_CATALOG_BUILTINS: Lazy<BTreeMap<&'static str, Func>> = Lazy::new(|
},
"csv_extract" => Table {
params!(Int64, String) => Operation::binary(move |_ecx, ncols, input| {
const MAX_EXTRACT_COLUMNS: i64 = 8192;
const TOO_MANY_EXTRACT_COLUMNS: i64 = MAX_EXTRACT_COLUMNS + 1;

let ncols = match ncols.into_literal_int64() {
None | Some(i64::MIN..=0) => {
sql_bail!("csv_extract number of columns must be a positive integer literal");
},
Some(ncols) => ncols,
Some(ncols @ 1..=MAX_EXTRACT_COLUMNS) => ncols,
Some(ncols @ TOO_MANY_EXTRACT_COLUMNS..) => {
return Err(PlanError::TooManyColumns {
max_num_columns: usize::try_from(MAX_EXTRACT_COLUMNS).unwrap_or(usize::MAX),
req_num_columns: usize::try_from(ncols).unwrap_or(usize::MAX),
});
},
};
let ncols = usize::try_from(ncols).expect("known to be greater than zero");

// Prevent OOMing if the user requests some extremely large number of columns.
let mut column_names = Vec::new();
if let Err(_) = column_names.try_reserve(ncols) {
sql_bail!("csv_extract number of columns too large");
};
for i in 1..=ncols {
column_names.push(format!("column{}", i).into());
}

let column_names = (1..=ncols).map(|i| format!("column{}", i).into()).collect();
Ok(TableFuncPlan {
expr: HirRelationExpr::CallTable {
func: TableFunc::CsvExtract(ncols),
Expand Down
6 changes: 6 additions & 0 deletions test/sqllogictest/extract.slt
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,9 @@ SELECT * FROM data, (VALUES (2)) ncols, csv_extract(ncols.column1, data.input)

query error db error: ERROR: csv_extract number of columns must be a positive integer literal
SELECT * FROM data, csv_extract((SELECT 2), data.input)

query error db error: ERROR: attempt to create relation with too many columns, 8193 max: 8192
SELECT * FROM data, csv_extract(8193, data.input)

statement ok
SELECT * FROM data, csv_extract(8192, data.input)
2 changes: 1 addition & 1 deletion test/testdrive/oom.td
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,4 @@ $ postgres-execute connection=mz_system
ALTER SYSTEM RESET max_result_size

! SELECT csv_extract(9223372036854775807, '');
contains:csv_extract number of columns too large
contains:attempt to create relation with too many columns, 9223372036854775807 max: 8192

0 comments on commit 99c27e9

Please sign in to comment.