Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 24 additions & 27 deletions datafusion/functions/src/encoding/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ use arrow::{
datatypes::{ByteArrayType, DataType},
};
use arrow_buffer::{Buffer, OffsetBufferBuilder};
use base64::{engine::general_purpose, Engine as _};
use base64::{
engine::{DecodePaddingMode, GeneralPurpose, GeneralPurposeConfig},
Engine as _,
};
use datafusion_common::{
cast::{as_generic_binary_array, as_generic_string_array},
not_impl_err, plan_err,
Expand All @@ -40,6 +43,14 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_macros::user_doc;
use std::any::Any;

// Decoding accepts input with or without trailing `=` padding; encoding never emits padding.
const BASE64_ENGINE: GeneralPurpose = GeneralPurpose::new(
&base64::alphabet::STANDARD,
GeneralPurposeConfig::new()
.with_encode_padding(false)
.with_decode_padding_mode(DecodePaddingMode::Indifferent),
);

#[user_doc(
doc_section(label = "Binary String Functions"),
description = "Encode binary data into a textual representation.",
Expand Down Expand Up @@ -302,7 +313,7 @@ fn hex_encode(input: &[u8]) -> String {
}

fn base64_encode(input: &[u8]) -> String {
general_purpose::STANDARD_NO_PAD.encode(input)
BASE64_ENGINE.encode(input)
}

fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
Expand All @@ -315,7 +326,7 @@ fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
}

fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
general_purpose::STANDARD_NO_PAD
BASE64_ENGINE
.decode_slice(input, buf)
.map_err(|e| internal_datafusion_err!("Failed to decode from base64: {e}"))
}
Expand Down Expand Up @@ -364,18 +375,16 @@ where
impl Encoding {
fn encode_scalar(self, value: Option<&[u8]>) -> ColumnarValue {
ColumnarValue::Scalar(match self {
Self::Base64 => ScalarValue::Utf8(
value.map(|v| general_purpose::STANDARD_NO_PAD.encode(v)),
),
Self::Base64 => ScalarValue::Utf8(value.map(|v| BASE64_ENGINE.encode(v))),
Self::Hex => ScalarValue::Utf8(value.map(hex::encode)),
})
}

fn encode_large_scalar(self, value: Option<&[u8]>) -> ColumnarValue {
ColumnarValue::Scalar(match self {
Self::Base64 => ScalarValue::LargeUtf8(
value.map(|v| general_purpose::STANDARD_NO_PAD.encode(v)),
),
Self::Base64 => {
ScalarValue::LargeUtf8(value.map(|v| BASE64_ENGINE.encode(v)))
}
Self::Hex => ScalarValue::LargeUtf8(value.map(hex::encode)),
})
}
Expand Down Expand Up @@ -411,15 +420,9 @@ impl Encoding {
};

let out = match self {
Self::Base64 => {
general_purpose::STANDARD_NO_PAD
.decode(value)
.map_err(|e| {
internal_datafusion_err!(
"Failed to decode value using base64: {e}"
)
})?
}
Self::Base64 => BASE64_ENGINE.decode(value).map_err(|e| {
internal_datafusion_err!("Failed to decode value using base64: {e}")
})?,
Self::Hex => hex::decode(value).map_err(|e| {
internal_datafusion_err!("Failed to decode value using hex: {e}")
})?,
Expand All @@ -435,15 +438,9 @@ impl Encoding {
};

let out = match self {
Self::Base64 => {
general_purpose::STANDARD_NO_PAD
.decode(value)
.map_err(|e| {
internal_datafusion_err!(
"Failed to decode value using base64: {e}"
)
})?
}
Self::Base64 => BASE64_ENGINE.decode(value).map_err(|e| {
internal_datafusion_err!("Failed to decode value using base64: {e}")
})?,
Self::Hex => hex::decode(value).map_err(|e| {
internal_datafusion_err!("Failed to decode value using hex: {e}")
})?,
Expand Down
9 changes: 4 additions & 5 deletions datafusion/sqllogictest/test_files/encoding.slt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ CREATE TABLE test(
hex_field TEXT
) as VALUES
(0, 'abc', encode('abc', 'base64'), encode('abc', 'hex')),
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
(1, 'qweqw', encode('qweqw', 'base64') || '=', encode('qweqw', 'hex')),
(2, NULL, NULL, NULL),
(3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex'))
;
Expand Down Expand Up @@ -52,23 +52,23 @@ query T
SELECT encode(bin_field, 'hex') FROM test ORDER BY num;
----
616263
717765717765
7177657177
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num;
----
abc
qweqwe
qweqw
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

query T
SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num;
----
abc
qweqwe
qweqw
NULL
8f50d3f60eae370ddbf85c86219c55108a350165

Expand Down Expand Up @@ -110,7 +110,6 @@ SELECT
column1_utf8view,
encode(column1_utf8view, 'base64') AS column1_base64,
encode(column1_utf8view, 'hex') AS column1_hex,

column2_utf8view,
encode(column2_utf8view, 'base64') AS column2_base64,
encode(column2_utf8view, 'hex') AS column2_hex
Expand Down