diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index a046fea2b0dc..427f993a380a 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -58,6 +58,7 @@ all-features = true async = ["dep:futures"] ffi = ["arrow-schema/ffi", "arrow-data/ffi"] force_validate = [] +simdutf8 = ["arrow-data/simdutf8"] [dev-dependencies] rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index ff1caaacaecc..a201537d1f8d 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1801,7 +1801,7 @@ impl ByteArrayType for GenericStringType { fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError> { // Verify that the slice as a whole is valid UTF-8 - let validated = std::str::from_utf8(values).map_err(|e| { + let validated = arrow_data::utf8::check_utf8(values).map_err(|e| { ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}")) })?; diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index c44ec01ce357..f373867bb0d9 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -38,12 +38,16 @@ all-features = true [dependencies] arrow-array = { workspace = true } arrow-cast = { workspace = true } +arrow-data = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } +[features] +simdutf8 = ["arrow-data/simdutf8"] + [dev-dependencies] arrow-buffer = { workspace = true } tempfile = "3.3" diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index 33927c93360a..f47f5b1f5465 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -210,7 +210,7 @@ impl RecordDecoder { }); // Need to truncate data to the actual amount of data read - let 
data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| { + let data = arrow_data::utf8::check_utf8(&self.data[..self.data_len]).map_err(|e| { let valid_up_to = e.valid_up_to(); // We can't use binary search because of empty fields diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 9c7a5206b2f4..e9f86603ba5d 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -39,6 +39,7 @@ bench = false force_validate = [] # Enable ffi support ffi = ["arrow-schema/ffi"] +simdutf8 = ["dep:simdutf8"] [package.metadata.docs.rs] all-features = true @@ -51,6 +52,7 @@ arrow-schema = { workspace = true } num-integer = { version = "0.1.46", default-features = false, features = ["std"] } num-traits = { version = "0.2.19", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } +simdutf8 = { workspace = true, optional = true } [dev-dependencies] diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 270f4f9948ac..d4f5afe55907 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -152,7 +152,7 @@ pub fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<(), Ar /// Validates the combination of `views` and `buffers` is a valid StringView pub fn validate_string_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> { validate_view_impl(views, buffers, |idx, b| { - std::str::from_utf8(b).map_err(|e| { + crate::utf8::check_utf8(b).map_err(|e| { ArrowError::InvalidArgumentError(format!( "Encountered non-UTF-8 data at index {idx}: {e}" )) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 21cf4e5b5e2c..8ebb180b1163 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1510,7 +1510,7 @@ impl ArrayData { T: ArrowNativeType + TryInto + num_traits::Num + std::fmt::Display, { let values_buffer = &self.buffers[1].as_slice(); - if let Ok(values_str) = std::str::from_utf8(values_buffer) { + if let Ok(values_str) = 
crate::utf8::check_utf8(values_buffer) { // Validate Offsets are correct self.validate_each_offset::<T>(values_buffer.len(), |string_index, range| { if !values_str.is_char_boundary(range.start) diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index 07e7553b2b43..257cdc7a4909 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -39,3 +39,5 @@ pub mod ffi; mod byte_view; pub use byte_view::*; + +pub mod utf8; diff --git a/arrow-data/src/utf8.rs b/arrow-data/src/utf8.rs new file mode 100644 index 000000000000..a58c15633932 --- /dev/null +++ b/arrow-data/src/utf8.rs @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! SIMD-accelerated UTF-8 validation utilities. + +/// Validates that `val` is valid UTF-8, returning the `&str` on success. +/// +/// When the `simdutf8` feature is enabled, uses SIMD-accelerated validation +/// on the happy path for improved throughput. Falls back to `std::str::from_utf8` +/// on the error path to provide a detailed [`std::str::Utf8Error`]. 
+#[inline(always)] +pub fn check_utf8(val: &[u8]) -> Result<&str, std::str::Utf8Error> { + #[cfg(feature = "simdutf8")] + { + if simdutf8::basic::from_utf8(val).is_ok() { + // SAFETY: simdutf8 just validated the bytes are valid UTF-8 + return Ok(unsafe { std::str::from_utf8_unchecked(val) }); + } + Err(std::str::from_utf8(val).unwrap_err()) + } + #[cfg(not(feature = "simdutf8"))] + std::str::from_utf8(val) +} diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 8293093ce486..a184d2a35f81 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -35,6 +35,9 @@ bench = false [package.metadata.docs.rs] all-features = true +[features] +simdutf8 = ["arrow-data/simdutf8"] + [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index 9d557c57469f..0af2b6a6f32a 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -378,7 +378,7 @@ fn decode_binary_view_inner( if validate_utf8 { // the values contains all data, no matter if it is short or long // we can validate utf8 in one go. 
- std::str::from_utf8(values.as_slice()).unwrap(); + arrow_data::utf8::check_utf8(values.as_slice()).unwrap(); } let builder = ArrayDataBuilder::new(DataType::BinaryView) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 137d785eee88..6ecd4be17fde 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -61,7 +61,7 @@ half = { version = "2.1", default-features = false, features = ["rand_distr"], o all-features = true [features] -default = ["csv", "ipc", "json"] +default = ["csv", "ipc", "json", "simdutf8"] async = ["arrow-array/async"] ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] @@ -81,6 +81,7 @@ force_validate = ["arrow-array/force_validate", "arrow-data/force_validate"] # Enable ffi support ffi = ["arrow-schema/ffi", "arrow-data/ffi", "arrow-array/ffi"] chrono-tz = ["arrow-array/chrono-tz"] +simdutf8 = ["arrow-data/simdutf8", "arrow-array/simdutf8", "arrow-row/simdutf8", "arrow-csv?/simdutf8"] canonical_extension_types = ["arrow-schema/canonical_extension_types"] [dev-dependencies]