This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new db20a81245 Add `simdutf8` feature to make `simdutf8` optional,
consolidate `check_valid_utf8` (#6979)
db20a81245 is described below
commit db20a812454e8b9e9ec099fc41564ea7c17c0e3f
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Jan 17 16:56:10 2025 -0500
Add `simdutf8` feature to make `simdutf8` optional, consolidate
`check_valid_utf8` (#6979)
* Add `simdutf8` feature
* Consolidate check utf8
* Publicly document and export
* fmt
* Update parquet/src/util/utf8.rs
Co-authored-by: Daniël Heres <[email protected]>
* enable by default
---------
Co-authored-by: Daniël Heres <[email protected]>
---
.github/workflows/parquet.yml | 2 +
parquet/Cargo.toml | 6 ++-
parquet/README.md | 20 ++++----
parquet/src/arrow/array_reader/byte_view_array.rs | 12 +----
parquet/src/arrow/buffer/offset_buffer.rs | 10 +---
parquet/src/lib.rs | 3 ++
parquet/src/util/mod.rs | 2 +
parquet/src/util/utf8.rs | 57 +++++++++++++++++++++++
8 files changed, 83 insertions(+), 29 deletions(-)
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 19503fde79..4c46fde198 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -97,6 +97,8 @@ jobs:
run: cargo check -p parquet --no-default-features
- name: Check compilation --no-default-features --features arrow
run: cargo check -p parquet --no-default-features --features arrow
+ - name: Check compilation --no-default-features --features simdutf8
+ run: cargo check -p parquet --no-default-features --features simdutf8
- name: Check compilation --no-default-features --all-features
run: cargo check -p parquet --all-features
- name: Check compilation --all-targets
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index c14c0e1d34..54992d864d 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -69,7 +69,7 @@ paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
sysinfo = { version = "0.33.0", optional = true, default-features = false,
features = ["system"] }
crc32fast = { version = "1.4.2", optional = true, default-features = false }
-simdutf8 = { version = "0.1.5"}
+simdutf8 = { version = "0.1.5", optional = true, default-features = false }
[dev-dependencies]
base64 = { version = "0.22", default-features = false, features = ["std"] }
@@ -98,7 +98,7 @@ zstd-sys = { version = ">=2.0.0, <2.0.14", default-features =
false }
all-features = true
[features]
-default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"]
+default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64",
"simdutf8"]
# Enable lz4
lz4 = ["lz4_flex"]
# Enable arrow reader/writer APIs
@@ -121,6 +121,8 @@ zstd = ["dep:zstd", "zstd-sys"]
sysinfo = ["dep:sysinfo"]
# Verify 32-bit CRC checksum when decoding parquet pages
crc = ["dep:crc32fast"]
+# Enable SIMD UTF-8 validation
+simdutf8 = ["dep:simdutf8"]
[[example]]
diff --git a/parquet/README.md b/parquet/README.md
index 9ff1d921d6..1224e52f3f 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -51,17 +51,21 @@ major releases may contain breaking API changes.
The `parquet` crate provides the following features which may be enabled in
your `Cargo.toml`:
-- `arrow` (default) - support for reading / writing
[`arrow`](https://crates.io/crates/arrow) arrays to / from parquet
-- `async` - support `async` APIs for reading parquet
-- `json` - support for reading / writing `json` data to / from parquet
-- `brotli` (default) - support for parquet using `brotli` compression
-- `flate2` (default) - support for parquet using `gzip` compression
-- `lz4` (default) - support for parquet using `lz4` compression
-- `zstd` (default) - support for parquet using `zstd` compression
-- `snap` (default) - support for parquet using `snappy` compression
+- `arrow` (default) - support for reading / writing [`arrow`] arrays to / from
Parquet
+- `async` - support `async` APIs for reading Parquet
+- `json` - support for reading / writing `json` data to / from Parquet
+- `brotli` (default) - support for Parquet using `brotli` compression
+- `flate2` (default) - support for Parquet using `gzip` compression
+- `lz4` (default) - support for Parquet using `lz4` compression
+- `zstd` (default) - support for Parquet using `zstd` compression
+- `snap` (default) - support for Parquet using `snappy` compression
- `cli` - parquet [CLI
tools](https://github.com/apache/arrow-rs/tree/main/parquet/src/bin)
- `crc` - enables functionality to automatically verify checksums of each page
(if present) when decoding
- `experimental` - Experimental APIs which may change, even between minor
releases
+- `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8
validation
+
+[`arrow`]: https://crates.io/crates/arrow
+[`simdutf8`]: https://crates.io/crates/simdutf8
## Parquet Feature Status
diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs
b/parquet/src/arrow/array_reader/byte_view_array.rs
index 00627ad612..8df6590600 100644
--- a/parquet/src/arrow/array_reader/byte_view_array.rs
+++ b/parquet/src/arrow/array_reader/byte_view_array.rs
@@ -27,6 +27,7 @@ use crate::data_type::Int32Type;
use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder};
use crate::errors::{ParquetError, Result};
use crate::schema::types::ColumnDescPtr;
+use crate::util::utf8::check_valid_utf8;
use arrow_array::{builder::make_view, ArrayRef};
use arrow_buffer::Buffer;
use arrow_data::ByteView;
@@ -681,17 +682,6 @@ impl ByteViewArrayDecoderDelta {
}
}
-/// Check that `val` is a valid UTF-8 sequence
-pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
- match simdutf8::basic::from_utf8(val) {
- Ok(_) => Ok(()),
- Err(_) => {
- let e = simdutf8::compat::from_utf8(val).unwrap_err();
- Err(general_err!("encountered non UTF-8 data: {}", e))
- }
- }
-}
-
#[cfg(test)]
mod tests {
use arrow_array::StringViewArray;
diff --git a/parquet/src/arrow/buffer/offset_buffer.rs
b/parquet/src/arrow/buffer/offset_buffer.rs
index 8dfb859612..5051dce12b 100644
--- a/parquet/src/arrow/buffer/offset_buffer.rs
+++ b/parquet/src/arrow/buffer/offset_buffer.rs
@@ -18,6 +18,7 @@
use crate::arrow::buffer::bit_util::iter_set_bits_rev;
use crate::arrow::record_reader::buffer::ValuesBuffer;
use crate::errors::{ParquetError, Result};
+use crate::util::utf8::check_valid_utf8;
use arrow_array::{make_array, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_data::ArrayDataBuilder;
@@ -117,14 +118,7 @@ impl<I: OffsetSizeTrait> OffsetBuffer<I> {
///
/// [`Self::try_push`] can perform this validation check on insertion
pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> {
- match
simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) {
- Ok(_) => Ok(()),
- Err(_) => {
- let e =
simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..])
- .unwrap_err();
- Err(general_err!("encountered non UTF-8 data: {}", e))
- }
- }
+ check_valid_utf8(&self.values.as_slice()[start_offset..])
}
/// Converts this into an [`ArrayRef`] with the provided `data_type` and
`null_buffer`
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 3b63845e70..3ca0dbe987 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -131,6 +131,9 @@ pub mod data_type;
pub use self::encodings::{decoding, encoding};
experimental!(#[macro_use] mod util);
+
+pub use util::utf8;
+
#[cfg(feature = "arrow")]
pub mod arrow;
pub mod column;
diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs
index dfa1285afc..1431132473 100644
--- a/parquet/src/util/mod.rs
+++ b/parquet/src/util/mod.rs
@@ -19,8 +19,10 @@
pub mod bit_util;
mod bit_pack;
pub(crate) mod interner;
+
#[cfg(any(test, feature = "test_common"))]
pub(crate) mod test_common;
+pub mod utf8;
#[cfg(any(test, feature = "test_common"))]
pub use self::test_common::page_util::{
diff --git a/parquet/src/util/utf8.rs b/parquet/src/util/utf8.rs
new file mode 100644
index 0000000000..2a537b5e53
--- /dev/null
+++ b/parquet/src/util/utf8.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`check_valid_utf8`] validation function
+use crate::errors::{ParquetError, Result};
+
+/// Check that `val` is a valid UTF-8 sequence.
+///
+/// If the `simdutf8` feature is enabled, this function will use
+/// SIMD-accelerated validation from the [`simdutf8`] crate. Otherwise, it
will use
+/// [`std::str::from_utf8`].
+///
+/// # Errors
+///
+/// Returns [`ParquetError::General`] with a message compatible with
[`std::str::from_utf8`] on failure.
+///
+/// # Example
+/// ```
+/// use parquet::utf8::check_valid_utf8;
+/// assert!(check_valid_utf8(b"hello").is_ok());
+/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98\x8E").is_ok());
+/// // invalid UTF-8
+/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98").is_err());
+/// ```
+///
+/// [`simdutf8`]: https://crates.io/crates/simdutf8
+#[inline(always)]
+pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
+ #[cfg(feature = "simdutf8")]
+ match simdutf8::basic::from_utf8(val) {
+ Ok(_) => Ok(()),
+ Err(_) => {
+ // `basic` reports no error details, so re-validate with `compat` (failure path only) to get them
+ let e = simdutf8::compat::from_utf8(val).unwrap_err();
+ Err(general_err!("encountered non UTF-8 data: {}", e))
+ }
+ }
+ #[cfg(not(feature = "simdutf8"))]
+ match std::str::from_utf8(val) {
+ Ok(_) => Ok(()),
+ Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
+ }
+}