apache · lyang24 · Mar 1, 2026 · Dandandan · Mar 2, 2026 · lyang24
diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
@@ -58,6 +58,7 @@ all-features = true
 async = ["dep:futures"]
 ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
 force_validate = []
+simdutf8 = ["arrow-data/simdutf8"]
 
 [dev-dependencies]
 rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }

diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs
@@ -1801,7 +1801,7 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {
 
     fn validate(offsets: &OffsetBuffer<Self::Offset>, values: &Buffer) -> Result<(), ArrowError> {
         // Verify that the slice as a whole is valid UTF-8
-        let validated = std::str::from_utf8(values).map_err(|e| {
+        let validated = arrow_data::utf8::check_utf8(values).map_err(|e| {
             ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
         })?;
 

diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml
@@ -38,12 +38,16 @@ all-features = true
 [dependencies]
 arrow-array = { workspace = true }
 arrow-cast = { workspace = true }
+arrow-data = { workspace = true }
 arrow-schema = { workspace = true }
 chrono = { workspace = true }
 csv = { version = "1.1", default-features = false }
 csv-core = { version = "0.1" }
 regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
 
+[features]
+simdutf8 = ["arrow-data/simdutf8"]
+
 [dev-dependencies]
 arrow-buffer = { workspace = true }
 tempfile = "3.3"

diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs
@@ -210,7 +210,7 @@ impl RecordDecoder {
             });
 
         // Need to truncate data to the actual amount of data read
-        let data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| {
+        let data = arrow_data::utf8::check_utf8(&self.data[..self.data_len]).map_err(|e| {
             let valid_up_to = e.valid_up_to();
 
             // We can't use binary search because of empty fields

diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml
@@ -39,6 +39,7 @@ bench = false
 force_validate = []
 # Enable ffi support
 ffi = ["arrow-schema/ffi"]
+simdutf8 = ["dep:simdutf8"]
 
 [package.metadata.docs.rs]
 all-features = true
@@ -51,6 +52,7 @@ arrow-schema = { workspace = true }
 num-integer = { version = "0.1.46", default-features = false, features = ["std"] }
 num-traits = { version = "0.2.19", default-features = false, features = ["std"] }
 half = { version = "2.1", default-features = false }
+simdutf8 = { workspace = true, optional = true }
 
 [dev-dependencies]
 

diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs
@@ -152,7 +152,7 @@ pub fn validate_binary_view(views: &[u128], buffers: &[Buffer]) -> Result<(), Ar
 /// Validates the combination of `views` and `buffers` is a valid StringView
 pub fn validate_string_view(views: &[u128], buffers: &[Buffer]) -> Result<(), ArrowError> {
     validate_view_impl(views, buffers, |idx, b| {
-        std::str::from_utf8(b).map_err(|e| {
+        crate::utf8::check_utf8(b).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Encountered non-UTF-8 data at index {idx}: {e}"
             ))

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
@@ -1510,7 +1510,7 @@ impl ArrayData {
         T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
     {
         let values_buffer = &self.buffers[1].as_slice();
-        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
+        if let Ok(values_str) = crate::utf8::check_utf8(values_buffer) {
             // Validate Offsets are correct
             self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
                 if !values_str.is_char_boundary(range.start)

diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs
@@ -39,3 +39,5 @@ pub mod ffi;
 
 mod byte_view;
 pub use byte_view::*;
+
+pub mod utf8;
diff --git a/arrow-data/src/utf8.rs b/arrow-data/src/utf8.rs
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! SIMD-accelerated UTF-8 validation utilities.
+
+/// Validates that `val` is valid UTF-8, returning it as a `&str` on success.
+///
+/// When the `simdutf8` feature is enabled, `simdutf8::basic::from_utf8` checks the
+/// happy path; on failure the input is re-validated with `std::str::from_utf8` only to
+/// obtain a detailed [`std::str::Utf8Error`]. Otherwise calls `std::str::from_utf8`.
+#[inline(always)]
+pub fn check_utf8(val: &[u8]) -> Result<&str, std::str::Utf8Error> {
+    #[cfg(feature = "simdutf8")]
+    {
+        if simdutf8::basic::from_utf8(val).is_ok() {
+            // SAFETY: `simdutf8::basic::from_utf8` returned `Ok`, so `val` is valid UTF-8
+            return Ok(unsafe { std::str::from_utf8_unchecked(val) });
+        }
+        Err(std::str::from_utf8(val).unwrap_err())
+    }
+    #[cfg(not(feature = "simdutf8"))]
+    std::str::from_utf8(val)
+}
diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml
@@ -35,6 +35,9 @@ bench = false
 [package.metadata.docs.rs]
 all-features = true
 
+[features]
+simdutf8 = ["arrow-data/simdutf8"]
+
 [dependencies]
 arrow-array = { workspace = true }
 arrow-buffer = { workspace = true }

diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
@@ -378,7 +378,7 @@ fn decode_binary_view_inner(
     if validate_utf8 {
         // the values contains all data, no matter if it is short or long
         // we can validate utf8 in one go.
-        std::str::from_utf8(values.as_slice()).unwrap();
+        arrow_data::utf8::check_utf8(values.as_slice()).unwrap();
     }
 
     let builder = ArrayDataBuilder::new(DataType::BinaryView)

diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
@@ -61,7 +61,7 @@ half = { version = "2.1", default-features = false, features = ["rand_distr"], o
 all-features = true
 
 [features]
-default = ["csv", "ipc", "json"]
+default = ["csv", "ipc", "json", "simdutf8"]
 async = ["arrow-array/async"]
 ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"]
 csv = ["arrow-csv"]
@@ -81,6 +81,7 @@ force_validate = ["arrow-array/force_validate", "arrow-data/force_validate"]
 # Enable ffi support
 ffi = ["arrow-schema/ffi", "arrow-data/ffi", "arrow-array/ffi"]
 chrono-tz = ["arrow-array/chrono-tz"]
+simdutf8 = ["arrow-data/simdutf8", "arrow-array/simdutf8", "arrow-row/simdutf8", "arrow-csv?/simdutf8"]
 canonical_extension_types = ["arrow-schema/canonical_extension_types"]
 
 [dev-dependencies]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -39,3 +39,5 @@ pub mod ffi;

		mod byte_view;
		pub use byte_view::*;

		pub mod utf8;