This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new db20a81245 Add `simdutf8` feature to make `simdutf8` optional, 
consolidate `check_valid_utf8` (#6979)
db20a81245 is described below

commit db20a812454e8b9e9ec099fc41564ea7c17c0e3f
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Jan 17 16:56:10 2025 -0500

    Add `simdutf8` feature to make `simdutf8` optional, consolidate 
`check_valid_utf8` (#6979)
    
    * Add `simdutf8` feature
    
    * Consolidate check utf8
    
    * Publicly document and export
    
    * fmt
    
    * Update parquet/src/util/utf8.rs
    
    Co-authored-by: Daniël Heres <[email protected]>
    
    * enable by default
    
    ---------
    
    Co-authored-by: Daniël Heres <[email protected]>
---
 .github/workflows/parquet.yml                     |  2 +
 parquet/Cargo.toml                                |  6 ++-
 parquet/README.md                                 | 20 ++++----
 parquet/src/arrow/array_reader/byte_view_array.rs | 12 +----
 parquet/src/arrow/buffer/offset_buffer.rs         | 10 +---
 parquet/src/lib.rs                                |  3 ++
 parquet/src/util/mod.rs                           |  2 +
 parquet/src/util/utf8.rs                          | 57 +++++++++++++++++++++++
 8 files changed, 83 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 19503fde79..4c46fde198 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -97,6 +97,8 @@ jobs:
         run: cargo check -p parquet --no-default-features
       - name: Check compilation --no-default-features --features arrow
         run: cargo check -p parquet --no-default-features --features arrow
+      - name: Check compilation --no-default-features --features simdutf8
+        run: cargo check -p parquet --no-default-features --features simdutf8
       - name: Check compilation --no-default-features --all-features
         run: cargo check -p parquet --all-features
       - name: Check compilation --all-targets
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index c14c0e1d34..54992d864d 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -69,7 +69,7 @@ paste = { version = "1.0" }
 half = { version = "2.1", default-features = false, features = ["num-traits"] }
 sysinfo = { version = "0.33.0", optional = true, default-features = false, 
features = ["system"] }
 crc32fast = { version = "1.4.2", optional = true, default-features = false }
-simdutf8 = { version = "0.1.5"}
+simdutf8 = { version = "0.1.5", optional = true, default-features = false }
 
 [dev-dependencies]
 base64 = { version = "0.22", default-features = false, features = ["std"] }
@@ -98,7 +98,7 @@ zstd-sys = { version = ">=2.0.0, <2.0.14", default-features = 
false }
 all-features = true
 
 [features]
-default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"]
+default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64", 
"simdutf8"]
 # Enable lz4
 lz4 = ["lz4_flex"]
 # Enable arrow reader/writer APIs
@@ -121,6 +121,8 @@ zstd = ["dep:zstd", "zstd-sys"]
 sysinfo = ["dep:sysinfo"]
 # Verify 32-bit CRC checksum when decoding parquet pages
 crc = ["dep:crc32fast"]
+# Enable SIMD UTF-8 validation
+simdutf8 = ["dep:simdutf8"]
 
 
 [[example]]
diff --git a/parquet/README.md b/parquet/README.md
index 9ff1d921d6..1224e52f3f 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -51,17 +51,21 @@ major releases may contain breaking API changes.
 
 The `parquet` crate provides the following features which may be enabled in 
your `Cargo.toml`:
 
-- `arrow` (default) - support for reading / writing 
[`arrow`](https://crates.io/crates/arrow) arrays to / from parquet
-- `async` - support `async` APIs for reading parquet
-- `json` - support for reading / writing `json` data to / from parquet
-- `brotli` (default) - support for parquet using `brotli` compression
-- `flate2` (default) - support for parquet using `gzip` compression
-- `lz4` (default) - support for parquet using `lz4` compression
-- `zstd` (default) - support for parquet using `zstd` compression
-- `snap` (default) - support for parquet using `snappy` compression
+- `arrow` (default) - support for reading / writing [`arrow`] arrays to / from 
Parquet
+- `async` - support `async` APIs for reading Parquet
+- `json` - support for reading / writing `json` data to / from Parquet
+- `brotli` (default) - support for Parquet using `brotli` compression
+- `flate2` (default) - support for Parquet using `gzip` compression
+- `lz4` (default) - support for Parquet using `lz4` compression
+- `zstd` (default) - support for Parquet using `zstd` compression
+- `snap` (default) - support for Parquet using `snappy` compression
 - `cli` - parquet [CLI 
tools](https://github.com/apache/arrow-rs/tree/main/parquet/src/bin)
 - `crc` - enables functionality to automatically verify checksums of each page 
(if present) when decoding
 - `experimental` - Experimental APIs which may change, even between minor 
releases
+- `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 
validation
+
+[`arrow`]: https://crates.io/crates/arrow
+[`simdutf8`]: https://crates.io/crates/simdutf8
 
 ## Parquet Feature Status
 
diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs 
b/parquet/src/arrow/array_reader/byte_view_array.rs
index 00627ad612..8df6590600 100644
--- a/parquet/src/arrow/array_reader/byte_view_array.rs
+++ b/parquet/src/arrow/array_reader/byte_view_array.rs
@@ -27,6 +27,7 @@ use crate::data_type::Int32Type;
 use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder};
 use crate::errors::{ParquetError, Result};
 use crate::schema::types::ColumnDescPtr;
+use crate::util::utf8::check_valid_utf8;
 use arrow_array::{builder::make_view, ArrayRef};
 use arrow_buffer::Buffer;
 use arrow_data::ByteView;
@@ -681,17 +682,6 @@ impl ByteViewArrayDecoderDelta {
     }
 }
 
-/// Check that `val` is a valid UTF-8 sequence
-pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
-    match simdutf8::basic::from_utf8(val) {
-        Ok(_) => Ok(()),
-        Err(_) => {
-            let e = simdutf8::compat::from_utf8(val).unwrap_err();
-            Err(general_err!("encountered non UTF-8 data: {}", e))
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use arrow_array::StringViewArray;
diff --git a/parquet/src/arrow/buffer/offset_buffer.rs 
b/parquet/src/arrow/buffer/offset_buffer.rs
index 8dfb859612..5051dce12b 100644
--- a/parquet/src/arrow/buffer/offset_buffer.rs
+++ b/parquet/src/arrow/buffer/offset_buffer.rs
@@ -18,6 +18,7 @@
 use crate::arrow::buffer::bit_util::iter_set_bits_rev;
 use crate::arrow::record_reader::buffer::ValuesBuffer;
 use crate::errors::{ParquetError, Result};
+use crate::util::utf8::check_valid_utf8;
 use arrow_array::{make_array, ArrayRef, OffsetSizeTrait};
 use arrow_buffer::{ArrowNativeType, Buffer};
 use arrow_data::ArrayDataBuilder;
@@ -117,14 +118,7 @@ impl<I: OffsetSizeTrait> OffsetBuffer<I> {
     ///
     /// [`Self::try_push`] can perform this validation check on insertion
     pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> {
-        match 
simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) {
-            Ok(_) => Ok(()),
-            Err(_) => {
-                let e = 
simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..])
-                    .unwrap_err();
-                Err(general_err!("encountered non UTF-8 data: {}", e))
-            }
-        }
+        check_valid_utf8(&self.values.as_slice()[start_offset..])
     }
 
     /// Converts this into an [`ArrayRef`] with the provided `data_type` and 
`null_buffer`
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 3b63845e70..3ca0dbe987 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -131,6 +131,9 @@ pub mod data_type;
 pub use self::encodings::{decoding, encoding};
 
 experimental!(#[macro_use] mod util);
+
+pub use util::utf8;
+
 #[cfg(feature = "arrow")]
 pub mod arrow;
 pub mod column;
diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs
index dfa1285afc..1431132473 100644
--- a/parquet/src/util/mod.rs
+++ b/parquet/src/util/mod.rs
@@ -19,8 +19,10 @@
 pub mod bit_util;
 mod bit_pack;
 pub(crate) mod interner;
+
 #[cfg(any(test, feature = "test_common"))]
 pub(crate) mod test_common;
+pub mod utf8;
 
 #[cfg(any(test, feature = "test_common"))]
 pub use self::test_common::page_util::{
diff --git a/parquet/src/util/utf8.rs b/parquet/src/util/utf8.rs
new file mode 100644
index 0000000000..2a537b5e53
--- /dev/null
+++ b/parquet/src/util/utf8.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`check_valid_utf8`] validation function
+use crate::errors::{ParquetError, Result};
+
+/// Check that `val` is a valid UTF-8 sequence.
+///
+/// If the `simdutf8` feature is enabled, this function will use
+/// SIMD-accelerated validation from the [`simdutf8`] crate. Otherwise, it will use
+/// [`std::str::from_utf8`].
+///
+/// # Errors
+///
+/// Returns [`ParquetError::General`] with a message compatible with [`std::str::from_utf8`] on failure.
+///
+/// # Example
+/// ```
+/// use parquet::utf8::check_valid_utf8;
+/// assert!(check_valid_utf8(b"hello").is_ok());
+/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98\x8E").is_ok());
+/// // invalid UTF-8
+/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98").is_err());
+/// ```
+///
+/// [`simdutf8`]: https://crates.io/crates/simdutf8
+#[inline(always)]
+pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
+    #[cfg(feature = "simdutf8")]
+    match simdutf8::basic::from_utf8(val) {
+        Ok(_) => Ok(()),
+        Err(_) => {
+            // Use simdutf8::compat to return details about the decoding error
+            let e = simdutf8::compat::from_utf8(val).unwrap_err();
+            Err(general_err!("encountered non UTF-8 data: {}", e))
+        }
+    }
+    #[cfg(not(feature = "simdutf8"))]
+    match std::str::from_utf8(val) {
+        Ok(_) => Ok(()),
+        Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
+    }
+}

Reply via email to