Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions arrow-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ all-features = true
async = ["dep:futures"]
ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
force_validate = []
simdutf8 = ["arrow-data/simdutf8"]

[dev-dependencies]
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
Expand Down
2 changes: 1 addition & 1 deletion arrow-array/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1801,7 +1801,7 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {

fn validate(offsets: &OffsetBuffer<Self::Offset>, values: &Buffer) -> Result<(), ArrowError> {
// Verify that the slice as a whole is valid UTF-8
let validated = std::str::from_utf8(values).map_err(|e| {
let validated = arrow_data::utf8::check_utf8(values).map_err(|e| {
ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
})?;

Expand Down
4 changes: 4 additions & 0 deletions arrow-csv/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,16 @@ all-features = true
[dependencies]
arrow-array = { workspace = true }
arrow-cast = { workspace = true }
arrow-data = { workspace = true }
arrow-schema = { workspace = true }
chrono = { workspace = true }
csv = { version = "1.1", default-features = false }
csv-core = { version = "0.1" }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }

[features]
simdutf8 = ["arrow-data/simdutf8"]

[dev-dependencies]
arrow-buffer = { workspace = true }
tempfile = "3.3"
Expand Down
2 changes: 1 addition & 1 deletion arrow-csv/src/reader/records.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ impl RecordDecoder {
});

// Need to truncate data to the actual amount of data read
let data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| {
let data = arrow_data::utf8::check_utf8(&self.data[..self.data_len]).map_err(|e| {
let valid_up_to = e.valid_up_to();

// We can't use binary search because of empty fields
Expand Down
2 changes: 2 additions & 0 deletions arrow-data/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ bench = false
force_validate = []
# Enable ffi support
ffi = ["arrow-schema/ffi"]
simdutf8 = ["dep:simdutf8"]

[package.metadata.docs.rs]
all-features = true
Expand All @@ -51,6 +52,7 @@ arrow-schema = { workspace = true }
num-integer = { version = "0.1.46", default-features = false, features = ["std"] }
num-traits = { version = "0.2.19", default-features = false, features = ["std"] }
half = { version = "2.1", default-features = false }
simdutf8 = { workspace = true, optional = true }

[dev-dependencies]

Expand Down
2 changes: 1 addition & 1 deletion arrow-data/src/byte_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ pub fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<(), Ar
/// Validates the combination of `views` and `buffers` is a valid StringView
pub fn validate_string_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> {
validate_view_impl(views, buffers, |idx, b| {
std::str::from_utf8(b).map_err(|e| {
crate::utf8::check_utf8(b).map_err(|e| {
ArrowError::InvalidArgumentError(format!(
"Encountered non-UTF-8 data at index {idx}: {e}"
))
Expand Down
2 changes: 1 addition & 1 deletion arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1510,7 +1510,7 @@ impl ArrayData {
T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
{
let values_buffer = &self.buffers[1].as_slice();
if let Ok(values_str) = std::str::from_utf8(values_buffer) {
if let Ok(values_str) = crate::utf8::check_utf8(values_buffer) {
// Validate Offsets are correct
self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
if !values_str.is_char_boundary(range.start)
Expand Down
2 changes: 2 additions & 0 deletions arrow-data/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ pub mod ffi;

mod byte_view;
pub use byte_view::*;

pub mod utf8;
37 changes: 37 additions & 0 deletions arrow-data/src/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! SIMD-accelerated UTF-8 validation utilities.

/// Validates that `val` is valid UTF-8, returning the `&str` on success.
///
/// When the `simdutf8` feature is enabled, uses SIMD-accelerated validation
/// on the happy path for improved throughput. Falls back to `std::str::from_utf8`
/// on the error path to provide a detailed [`std::str::Utf8Error`].
///
/// When the `simdutf8` feature is disabled, this is a direct call to
/// `std::str::from_utf8`.
///
/// # Errors
///
/// Returns the [`std::str::Utf8Error`] produced by `std::str::from_utf8` when
/// `val` is not valid UTF-8.
#[inline(always)]
pub fn check_utf8(val: &[u8]) -> Result<&str, std::str::Utf8Error> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we unify this with the existing utf8 check?

Copy link
Contributor Author

@lyang24 lyang24 Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, do you mean unifying it with the one in parquet folder?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah

// Feature-gated path: try the SIMD validator first and only touch the std
// validator when the input is rejected.
#[cfg(feature = "simdutf8")]
{
// Happy path: the SIMD validator accepted the bytes, so the `&str` can be
// built without validating a second time.
if simdutf8::basic::from_utf8(val).is_ok() {
// SAFETY: simdutf8 just validated the bytes are valid UTF-8
return Ok(unsafe { std::str::from_utf8_unchecked(val) });
}
// Error path: validate again with std purely to obtain the detailed
// `Utf8Error` that callers receive.
// NOTE(review): `unwrap_err` panics if std accepts bytes that simdutf8
// rejected. Returning `std::str::from_utf8(val)` directly looks equivalent
// and would remove that panic path — confirm before changing.
Err(std::str::from_utf8(val).unwrap_err())
}
// Default path when the `simdutf8` feature is disabled: plain std validation.
#[cfg(not(feature = "simdutf8"))]
std::str::from_utf8(val)
}
3 changes: 3 additions & 0 deletions arrow-row/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ bench = false
[package.metadata.docs.rs]
all-features = true

[features]
simdutf8 = ["arrow-data/simdutf8"]

[dependencies]
arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion arrow-row/src/variable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ fn decode_binary_view_inner(
if validate_utf8 {
// the values contains all data, no matter if it is short or long
// we can validate utf8 in one go.
std::str::from_utf8(values.as_slice()).unwrap();
arrow_data::utf8::check_utf8(values.as_slice()).unwrap();
}

let builder = ArrayDataBuilder::new(DataType::BinaryView)
Expand Down
3 changes: 2 additions & 1 deletion arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ half = { version = "2.1", default-features = false, features = ["rand_distr"], o
all-features = true

[features]
default = ["csv", "ipc", "json"]
default = ["csv", "ipc", "json", "simdutf8"]
async = ["arrow-array/async"]
ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"]
csv = ["arrow-csv"]
Expand All @@ -81,6 +81,7 @@ force_validate = ["arrow-array/force_validate", "arrow-data/force_validate"]
# Enable ffi support
ffi = ["arrow-schema/ffi", "arrow-data/ffi", "arrow-array/ffi"]
chrono-tz = ["arrow-array/chrono-tz"]
simdutf8 = ["arrow-data/simdutf8", "arrow-array/simdutf8", "arrow-row/simdutf8", "arrow-csv?/simdutf8"]
canonical_extension_types = ["arrow-schema/canonical_extension_types"]

[dev-dependencies]
Expand Down
Loading