This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 389b2b0a67 [Variant] Fix several overflow panic risks for 32-bit arch
(#7752)
389b2b0a67 is described below
commit 389b2b0a67a10aba185677c3fc5c5e6f285d5a0d
Author: Ryan Johnson <[email protected]>
AuthorDate: Tue Jun 24 14:12:52 2025 -0700
[Variant] Fix several overflow panic risks for 32-bit arch (#7752)
# Which issue does this PR close?
Part of
* https://github.com/apache/arrow-rs/issues/6736
# Rationale for this change
The variant spec makes extensive use of 4-byte offsets. On a 32-bit
system, this can lead to integer overflow panics when doing offset
arithmetic on malicious or malformed variant data, because `usize` is 32
bits. That is bad.
# What changes are included in this PR?
Use checked arithmetic in code that operates on offsets extracted from
(untrusted) variant byte buffers.
Of particular interest, we define and use a new
`slice_from_slice_at_offset` helper, which safely adds an offset to a
range.
# Are there any user-facing changes?
No.
---
parquet-variant/src/decoder.rs | 16 +++++-----
parquet-variant/src/utils.rs | 45 +++++++++++++++++++++-------
parquet-variant/src/variant.rs | 4 +--
parquet-variant/src/variant/list.rs | 39 +++++++++++-------------
parquet-variant/src/variant/metadata.rs | 31 +++++++++++--------
parquet-variant/src/variant/object.rs | 53 +++++++++++++++++++++------------
6 files changed, 117 insertions(+), 71 deletions(-)
diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index 7096b0a086..cb8336b5b8 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -14,7 +14,7 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
+use crate::utils::{array_from_slice, slice_from_slice_at_offset,
string_from_slice};
use crate::ShortString;
use arrow_schema::ArrowError;
@@ -23,6 +23,9 @@ use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime,
Utc};
use std::array::TryFromSliceError;
use std::num::TryFromIntError;
+// Makes the code a bit more readable
+pub(crate) const VARIANT_VALUE_HEADER_BYTES: usize = 1;
+
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VariantBasicType {
Primitive = 0,
@@ -262,21 +265,19 @@ pub(crate) fn decode_timestampntz_micros(data: &[u8]) ->
Result<NaiveDateTime, A
/// Decodes a Binary from the value section of a variant.
pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
- let value = slice_from_slice(data, 4..4 + len)?;
- Ok(value)
+ slice_from_slice_at_offset(data, 4, 0..len)
}
/// Decodes a long string from the value section of a variant.
pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
- let string = string_from_slice(data, 4..4 + len)?;
- Ok(string)
+ string_from_slice(data, 4, 0..len)
}
/// Decodes a short string from the value section of a variant.
pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) ->
Result<ShortString, ArrowError> {
let len = (metadata >> 2) as usize;
- let string = string_from_slice(data, 0..len)?;
+ let string = string_from_slice(data, 0, 0..len)?;
ShortString::try_new(string)
}
@@ -518,10 +519,11 @@ mod tests {
let width = OffsetSizeBytes::Two;
- // dictionary_size starts immediately after the header
+ // dictionary_size starts immediately after the header byte
let dict_size = width.unpack_usize(&buf, 1, 0).unwrap();
assert_eq!(dict_size, 2);
+ // offset array immediately follows the dictionary size
let first = width.unpack_usize(&buf, 1, 1).unwrap();
assert_eq!(first, 0);
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index 7a1b9f0399..e0f966cab8 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -21,6 +21,11 @@ use arrow_schema::ArrowError;
use std::fmt::Debug;
use std::slice::SliceIndex;
+/// Helper for reporting integer overflow errors in a consistent way.
+pub(crate) fn overflow_error(msg: &str) -> ArrowError {
+ ArrowError::InvalidArgumentError(format!("Integer overflow computing
{msg}"))
+}
+
#[inline]
pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
bytes: &[u8],
@@ -33,17 +38,33 @@ pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone
+ Debug>(
))
})
}
+
+/// Helper to safely slice bytes with offset calculations.
+///
+/// Equivalent to `slice_from_slice(bytes, (base_offset +
range.start)..(base_offset + range.end))`
+/// but using checked addition to prevent integer overflow panics on 32-bit
systems.
+#[inline]
+pub(crate) fn slice_from_slice_at_offset(
+ bytes: &[u8],
+ base_offset: usize,
+ range: Range<usize>,
+) -> Result<&[u8], ArrowError> {
+ let start_byte = base_offset
+ .checked_add(range.start)
+ .ok_or_else(|| overflow_error("slice start"))?;
+ let end_byte = base_offset
+ .checked_add(range.end)
+ .ok_or_else(|| overflow_error("slice end"))?;
+ slice_from_slice(bytes, start_byte..end_byte)
+}
+
pub(crate) fn array_from_slice<const N: usize>(
bytes: &[u8],
offset: usize,
) -> Result<[u8; N], ArrowError> {
- let bytes = slice_from_slice(bytes, offset..offset + N)?;
- bytes.try_into().map_err(map_try_from_slice_error)
-}
-
-/// To be used in `map_err` when unpacking an integer from a slice of bytes.
-pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
- ArrowError::InvalidArgumentError(e.to_string())
+ slice_from_slice_at_offset(bytes, offset, 0..N)?
+ .try_into()
+ .map_err(|e: TryFromSliceError|
ArrowError::InvalidArgumentError(e.to_string()))
}
pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
@@ -53,9 +74,13 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) ->
Result<u8, ArrowError> {
.ok_or_else(|| ArrowError::InvalidArgumentError("Received empty
bytes".to_string()))
}
-/// Helper to get a &str from a slice based on range, if it's valid or an
error otherwise
-pub(crate) fn string_from_slice(slice: &[u8], range: Range<usize>) ->
Result<&str, ArrowError> {
- str::from_utf8(slice_from_slice(slice, range)?)
+/// Helper to get a &str from a slice at the given offset and range, or an
error if invalid.
+pub(crate) fn string_from_slice(
+ slice: &[u8],
+ offset: usize,
+ range: Range<usize>,
+) -> Result<&str, ArrowError> {
+ str::from_utf8(slice_from_slice_at_offset(slice, offset, range)?)
.map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8
string".to_string()))
}
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index b343a538d5..da3fbd36fc 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -168,13 +168,13 @@ impl<'a> TryFrom<&'a str> for ShortString<'a> {
}
}
-impl<'a> AsRef<str> for ShortString<'a> {
+impl AsRef<str> for ShortString<'_> {
fn as_ref(&self) -> &str {
self.0
}
}
-impl<'a> Deref for ShortString<'a> {
+impl Deref for ShortString<'_> {
type Target = str;
fn deref(&self) -> &Self::Target {
diff --git a/parquet-variant/src/variant/list.rs
b/parquet-variant/src/variant/list.rs
index d9fd20eacc..703761b420 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -15,11 +15,16 @@
// specific language governing permissions and limitations
// under the License.
use crate::decoder::OffsetSizeBytes;
-use crate::utils::{first_byte_from_slice, slice_from_slice,
validate_fallible_iterator};
+use crate::utils::{
+ first_byte_from_slice, overflow_error, slice_from_slice_at_offset,
validate_fallible_iterator,
+};
use crate::variant::{Variant, VariantMetadata};
use arrow_schema::ArrowError;
+// The value header occupies one byte; use a named constant for readability
+const NUM_HEADER_BYTES: usize = 1;
+
/// A parsed version of the variant array value header byte.
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct VariantListHeader {
@@ -78,25 +83,16 @@ impl<'m, 'v> VariantList<'m, 'v> {
false => OffsetSizeBytes::One,
};
- // Skip the header byte to read the num_elements
- let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
- let first_offset_byte = 1 + num_elements_size as usize;
-
- let overflow =
- || ArrowError::InvalidArgumentError("Variant value_byte_length
overflow".into());
-
- // 1. num_elements + 1
- let n_offsets = num_elements.checked_add(1).ok_or_else(overflow)?;
-
- // 2. (num_elements + 1) * offset_size
- let value_bytes = n_offsets
- .checked_mul(header.offset_size as usize)
- .ok_or_else(overflow)?;
+ // Skip the header byte to read the num_elements; the offset array
immediately follows
+ let num_elements = num_elements_size.unpack_usize(value,
NUM_HEADER_BYTES, 0)?;
+ let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize;
- // 3. first_offset_byte + ...
- let first_value_byte = first_offset_byte
- .checked_add(value_bytes)
- .ok_or_else(overflow)?;
+ // (num_elements + 1) * offset_size + first_offset_byte
+ let first_value_byte = num_elements
+ .checked_add(1)
+ .and_then(|n| n.checked_mul(header.offset_size as usize))
+ .and_then(|n| n.checked_add(first_offset_byte))
+ .ok_or_else(|| overflow_error("offset of variant list values"))?;
let new_self = Self {
metadata,
@@ -139,9 +135,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
};
// Read the value bytes from the offsets
- let variant_value_bytes = slice_from_slice(
+ let variant_value_bytes = slice_from_slice_at_offset(
self.value,
- self.first_value_byte + unpack(index)?..self.first_value_byte +
unpack(index + 1)?,
+ self.first_value_byte,
+ unpack(index)?..unpack(index + 1)?,
)?;
let variant = Variant::try_new_with_metadata(self.metadata,
variant_value_bytes)?;
Ok(variant)
diff --git a/parquet-variant/src/variant/metadata.rs
b/parquet-variant/src/variant/metadata.rs
index bfefeb506d..8fff65a6ee 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -17,7 +17,8 @@
use crate::decoder::OffsetSizeBytes;
use crate::utils::{
- first_byte_from_slice, slice_from_slice, string_from_slice,
validate_fallible_iterator,
+ first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
+ validate_fallible_iterator,
};
use arrow_schema::ArrowError;
@@ -35,6 +36,9 @@ pub(crate) struct VariantMetadataHeader {
// purposes and to make that visible.
const CORRECT_VERSION_VALUE: u8 = 1;
+// The metadata header occupies one byte; use a named constant for readability
+const NUM_HEADER_BYTES: usize = 1;
+
impl VariantMetadataHeader {
/// Tries to construct the variant metadata header, which has the form
///
@@ -102,20 +106,22 @@ impl<'m> VariantMetadata<'m> {
let header_byte = first_byte_from_slice(bytes)?;
let header = VariantMetadataHeader::try_new(header_byte)?;
- // Offset 1, index 0 because first element after header is dictionary
size
- let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
+ // First element after header is dictionary size
+ let dict_size = header
+ .offset_size
+ .unpack_usize(bytes, NUM_HEADER_BYTES, 0)?;
// Calculate the starting offset of the dictionary string bytes.
//
// Value header, dict_size (offset_size bytes), and dict_size+1 offsets
- // = 1 + offset_size + (dict_size + 1) * offset_size
- // = (dict_size + 2) * offset_size + 1
+ // = NUM_HEADER_BYTES + offset_size + (dict_size + 1) * offset_size
+ // = (dict_size + 2) * offset_size + NUM_HEADER_BYTES
let dictionary_key_start_byte = dict_size
.checked_add(2)
.and_then(|n| n.checked_mul(header.offset_size as usize))
- .and_then(|n| n.checked_add(1))
- .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length
overflow".into()))?;
- println!("dictionary_key_start_byte: {dictionary_key_start_byte}");
+ .and_then(|n| n.checked_add(NUM_HEADER_BYTES))
+ .ok_or_else(|| overflow_error("offset of variant metadata
dictionary"))?;
+
let new_self = Self {
bytes,
header,
@@ -149,16 +155,17 @@ impl<'m> VariantMetadata<'m> {
/// This offset is an index into the dictionary, at the boundary between
string `i-1` and string
/// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
- // Skipping the header byte (setting byte_offset = 1) and the
dictionary_size (setting offset_index +1)
+ // Skip the header byte and the dictionary_size entry (by offset_index
+ 1)
let bytes = slice_from_slice(self.bytes,
..self.dictionary_key_start_byte)?;
- self.header.offset_size.unpack_usize(bytes, 1, i + 1)
+ self.header
+ .offset_size
+ .unpack_usize(bytes, NUM_HEADER_BYTES, i + 1)
}
/// Gets a dictionary entry by index
pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
- let dictionary_keys_bytes = slice_from_slice(self.bytes,
self.dictionary_key_start_byte..)?;
let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
- string_from_slice(dictionary_keys_bytes, byte_range)
+ string_from_slice(self.bytes, self.dictionary_key_start_byte,
byte_range)
}
/// Get all dictionary entries as an Iterator of strings
diff --git a/parquet-variant/src/variant/object.rs
b/parquet-variant/src/variant/object.rs
index 471b94ccdb..b52701f8bb 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -16,12 +16,16 @@
// under the License.
use crate::decoder::OffsetSizeBytes;
use crate::utils::{
- first_byte_from_slice, slice_from_slice, try_binary_search_range_by,
validate_fallible_iterator,
+ first_byte_from_slice, overflow_error, slice_from_slice,
try_binary_search_range_by,
+ validate_fallible_iterator,
};
use crate::variant::{Variant, VariantMetadata};
use arrow_schema::ArrowError;
+// The value header occupies one byte; use a named constant for readability
+const NUM_HEADER_BYTES: usize = 1;
+
/// Header structure for [`VariantObject`]
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct VariantObjectHeader {
@@ -72,36 +76,43 @@ impl<'m, 'v> VariantObject<'m, 'v> {
let header_byte = first_byte_from_slice(value)?;
let header = VariantObjectHeader::try_new(header_byte)?;
- // Determine num_elements size based on is_large flag
+ // Determine num_elements size based on is_large flag and fetch the
value
let num_elements_size = if header.is_large {
OffsetSizeBytes::Four
} else {
OffsetSizeBytes::One
};
+ let num_elements = num_elements_size.unpack_usize(value,
NUM_HEADER_BYTES, 0)?;
+
+ // Calculate byte offsets for different sections with overflow
protection
+ let field_ids_start_byte = NUM_HEADER_BYTES
+ .checked_add(num_elements_size as usize)
+ .ok_or_else(|| overflow_error("offset of variant object field
ids"))?;
- // Parse num_elements
- let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+ let field_offsets_start_byte = num_elements
+ .checked_mul(header.field_id_size as usize)
+ .and_then(|n| n.checked_add(field_ids_start_byte))
+ .ok_or_else(|| overflow_error("offset of variant object field
offsets"))?;
- // Calculate byte offsets for different sections
- let field_ids_start_byte = 1 + num_elements_size as usize;
- let field_offsets_start_byte =
- field_ids_start_byte + num_elements * header.field_id_size as
usize;
- let values_start_byte =
- field_offsets_start_byte + (num_elements + 1) *
header.field_offset_size as usize;
+ let values_start_byte = num_elements
+ .checked_add(1)
+ .and_then(|n| n.checked_mul(header.field_offset_size as usize))
+ .and_then(|n| n.checked_add(field_offsets_start_byte))
+ .ok_or_else(|| overflow_error("offset of variant object field
values"))?;
// Spec says: "The last field_offset points to the byte after the end
of the last value"
//
// Use the last offset as a bounds check. The iterator check below
doesn't use it -- offsets
// are not monotonic -- so we have to check separately here.
- let last_field_offset =
- header
- .field_offset_size
- .unpack_usize(value, field_offsets_start_byte, num_elements)?;
- if values_start_byte + last_field_offset > value.len() {
+ let end_offset = header
+ .field_offset_size
+ .unpack_usize(value, field_offsets_start_byte, num_elements)?
+ .checked_add(values_start_byte)
+ .ok_or_else(|| overflow_error("end of variant object field
values"))?;
+ if end_offset > value.len() {
return Err(ArrowError::InvalidArgumentError(format!(
- "Last field offset value {} at offset {} is outside the value
slice of length {}",
- last_field_offset,
- values_start_byte,
+ "Last field offset value {} is outside the value slice of
length {}",
+ end_offset,
value.len()
)));
}
@@ -140,7 +151,11 @@ impl<'m, 'v> VariantObject<'m, 'v> {
self.field_offsets_start_byte,
i,
)?;
- let value_bytes = slice_from_slice(self.value, self.values_start_byte
+ start_offset..)?;
+ let value_start = self
+ .values_start_byte
+ .checked_add(start_offset)
+ .ok_or_else(|| overflow_error("offset of variant object field"))?;
+ let value_bytes = slice_from_slice(self.value, value_start..)?;
Variant::try_new_with_metadata(self.metadata, value_bytes)
}