This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ee2f75a66 Err on `try_from_le_slice` (#6295)
ee2f75a66 is described below
commit ee2f75a66278dbd3e7aa6b85b5322951c792a58d
Author: Samuel Colvin <[email protected]>
AuthorDate: Mon Aug 26 22:20:51 2024 +0100
Err on `try_from_le_slice` (#6295)
* Err on try_from_le_slice, fix #3577
* format and changes
* small cleanup
* fix clippy
* add bad metadata test
* run test only if feature is enabled
* add MRE test
* fmt
---
parquet/src/encodings/rle.rs | 9 +++---
parquet/src/file/page_index/index.rs | 35 ++++++++++++++++++++----
parquet/src/file/serialized_reader.rs | 5 ++--
parquet/src/file/statistics.rs | 18 +++++++-----
parquet/src/util/bit_util.rs | 25 +++++++++--------
parquet/tests/arrow_reader/bad_data.rs | 25 +++++++++++++++++
parquet/tests/arrow_reader/bad_raw_metadata.bin | Bin 0 -> 35456 bytes
7 files changed, 85 insertions(+), 32 deletions(-)
diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index 581f14b3c..97a122941 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -20,7 +20,6 @@ use std::{cmp, mem::size_of};
use bytes::Bytes;
use crate::errors::{ParquetError, Result};
-use crate::util::bit_util::from_le_slice;
use crate::util::bit_util::{self, BitReader, BitWriter, FromBytes};
/// Rle/Bit-Packing Hybrid Encoding
@@ -356,13 +355,13 @@ impl RleDecoder {
}
let value = if self.rle_left > 0 {
- let rle_value = from_le_slice(
+ let rle_value = T::try_from_le_slice(
&self
.current_value
.as_mut()
.expect("current_value should be Some")
.to_ne_bytes(),
- );
+ )?;
self.rle_left -= 1;
rle_value
} else {
@@ -388,9 +387,9 @@ impl RleDecoder {
let num_values =
cmp::min(buffer.len() - values_read, self.rle_left as
usize);
for i in 0..num_values {
- let repeated_value = from_le_slice(
+ let repeated_value = T::try_from_le_slice(
&self.current_value.as_mut().unwrap().to_ne_bytes(),
- );
+ )?;
buffer[values_read + i] = repeated_value;
}
self.rle_left -= num_values as u32;
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index 0c23e4aa3..662ba4562 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -23,7 +23,6 @@ use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray,
Int96};
use crate::errors::ParquetError;
use crate::file::metadata::LevelHistogram;
use crate::format::{BoundaryOrder, ColumnIndex};
-use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;
/// Typed statistics for one data page
@@ -192,7 +191,7 @@ impl<T: ParquetValueType> NativeIndex<T> {
let indexes = index
.min_values
.iter()
- .zip(index.max_values.into_iter())
+ .zip(index.max_values.iter())
.zip(index.null_pages.into_iter())
.zip(null_counts.into_iter())
.zip(rep_hists.into_iter())
@@ -205,9 +204,10 @@ impl<T: ParquetValueType> NativeIndex<T> {
let (min, max) = if is_null {
(None, None)
} else {
- let min = min.as_slice();
- let max = max.as_slice();
- (Some(from_le_slice::<T>(min)),
Some(from_le_slice::<T>(max)))
+ (
+ Some(T::try_from_le_slice(min)?),
+ Some(T::try_from_le_slice(max)?),
+ )
};
Ok(PageIndex {
min,
@@ -321,4 +321,29 @@ mod tests {
assert_eq!(page_index.repetition_level_histogram(), None);
assert_eq!(page_index.definition_level_histogram(), None);
}
+
+ #[test]
+ fn test_invalid_column_index() {
+ let column_index = ColumnIndex {
+ null_pages: vec![true, false],
+ min_values: vec![
+ vec![],
+ vec![], // this shouldn't be empty as null_pages[1] is false
+ ],
+ max_values: vec![
+ vec![],
+ vec![], // this shouldn't be empty as null_pages[1] is false
+ ],
+ null_counts: None,
+ repetition_level_histograms: None,
+ definition_level_histograms: None,
+ boundary_order: BoundaryOrder::UNORDERED,
+ };
+
+ let err = NativeIndex::<i32>::try_new(column_index).unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Parquet error: error converting value, expected 4 bytes got 0"
+ );
+ }
}
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index 0a3e51931..b8ee4001a 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -781,7 +781,6 @@ mod tests {
use crate::file::writer::SerializedFileWriter;
use crate::record::RowAccessor;
use crate::schema::parser::parse_message_type;
- use crate::util::bit_util::from_le_slice;
use crate::util::test_common::file_util::{get_test_file, get_test_path};
use super::*;
@@ -1537,8 +1536,8 @@ mod tests {
assert_eq!(row_group_index.indexes.len(), page_size);
assert_eq!(row_group_index.boundary_order, boundary_order);
row_group_index.indexes.iter().all(|x| {
- x.min.as_ref().unwrap() >= &from_le_slice::<T>(min_max.0)
- && x.max.as_ref().unwrap() <= &from_le_slice::<T>(min_max.1)
+ x.min.as_ref().unwrap() >=
&T::try_from_le_slice(min_max.0).unwrap()
+ && x.max.as_ref().unwrap() <=
&T::try_from_le_slice(min_max.1).unwrap()
});
}
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 680c75d6b..854900f1e 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -47,7 +47,7 @@ use crate::basic::Type;
use crate::data_type::private::ParquetValueType;
use crate::data_type::*;
use crate::errors::{ParquetError, Result};
-use crate::util::bit_util::from_le_slice;
+use crate::util::bit_util::FromBytes;
pub(crate) mod private {
use super::*;
@@ -186,14 +186,18 @@ pub fn from_thrift(
// INT96 statistics may not be correct, because comparison
is signed
// byte-wise, not actual timestamps. It is recommended to
ignore
// min/max statistics for INT96 columns.
- let min = min.map(|data| {
+ let min = if let Some(data) = min {
assert_eq!(data.len(), 12);
- from_le_slice::<Int96>(&data)
- });
- let max = max.map(|data| {
+ Some(Int96::try_from_le_slice(&data)?)
+ } else {
+ None
+ };
+ let max = if let Some(data) = max {
assert_eq!(data.len(), 12);
- from_le_slice::<Int96>(&data)
- });
+ Some(Int96::try_from_le_slice(&data)?)
+ } else {
+ None
+ };
Statistics::int96(min, max, distinct_count, null_count,
old_format)
}
Type::FLOAT => Statistics::float(
diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs
index adbf45014..a17202254 100644
--- a/parquet/src/util/bit_util.rs
+++ b/parquet/src/util/bit_util.rs
@@ -23,12 +23,6 @@ use crate::data_type::{AsBytes, ByteArray,
FixedLenByteArray, Int96};
use crate::errors::{ParquetError, Result};
use crate::util::bit_pack::{unpack16, unpack32, unpack64, unpack8};
-#[inline]
-pub fn from_le_slice<T: FromBytes>(bs: &[u8]) -> T {
- // TODO: propagate the error (#3577)
- T::try_from_le_slice(bs).unwrap()
-}
-
#[inline]
fn array_from_slice<const N: usize>(bs: &[u8]) -> Result<[u8; N]> {
// Need to slice as may be called with zero-padded values
@@ -91,15 +85,22 @@ unsafe impl FromBytes for Int96 {
type Buffer = [u8; 12];
fn try_from_le_slice(b: &[u8]) -> Result<Self> {
- Ok(Self::from_le_bytes(array_from_slice(b)?))
+ let bs: [u8; 12] = array_from_slice(b)?;
+ let mut i = Int96::new();
+ i.set_data(
+ u32::try_from_le_slice(&bs[0..4])?,
+ u32::try_from_le_slice(&bs[4..8])?,
+ u32::try_from_le_slice(&bs[8..12])?,
+ );
+ Ok(i)
}
fn from_le_bytes(bs: Self::Buffer) -> Self {
let mut i = Int96::new();
i.set_data(
- from_le_slice(&bs[0..4]),
- from_le_slice(&bs[4..8]),
- from_le_slice(&bs[8..12]),
+ u32::try_from_le_slice(&bs[0..4]).unwrap(),
+ u32::try_from_le_slice(&bs[4..8]).unwrap(),
+ u32::try_from_le_slice(&bs[8..12]).unwrap(),
);
i
}
@@ -438,7 +439,7 @@ impl BitReader {
}
// TODO: better to avoid copying here
- Some(from_le_slice(v.as_bytes()))
+ T::try_from_le_slice(v.as_bytes()).ok()
}
/// Read multiple values from their packed representation where each
element is represented
@@ -1026,7 +1027,7 @@ mod tests {
.collect();
// Generic values used to check against actual values read from
`get_batch`.
- let expected_values: Vec<T> = values.iter().map(|v|
from_le_slice(v.as_bytes())).collect();
+ let expected_values: Vec<T> = values.iter().map(|v|
T::try_from_le_slice(v.as_bytes()).unwrap()).collect();
(0..total).for_each(|i| writer.put_value(values[i], num_bits));
diff --git a/parquet/tests/arrow_reader/bad_data.rs
b/parquet/tests/arrow_reader/bad_data.rs
index 6e325f119..cbd5d4d3b 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -134,3 +134,28 @@ fn read_file(name: &str) -> Result<usize, ParquetError> {
}
Ok(num_rows)
}
+
+#[cfg(feature = "async")]
+#[tokio::test]
+async fn bad_metadata_err() {
+ use bytes::Bytes;
+ use parquet::arrow::async_reader::MetadataLoader;
+
+ let metadata_buffer =
Bytes::from_static(include_bytes!("bad_raw_metadata.bin"));
+
+ let metadata_length = metadata_buffer.len();
+
+ let mut reader = std::io::Cursor::new(&metadata_buffer);
+ let mut loader = MetadataLoader::load(&mut reader, metadata_length, None)
+ .await
+ .unwrap();
+ loader.load_page_index(false, false).await.unwrap();
+ loader.load_page_index(false, true).await.unwrap();
+
+ let err = loader.load_page_index(true, false).await.unwrap_err();
+
+ assert_eq!(
+ err.to_string(),
+ "Parquet error: error converting value, expected 4 bytes got 0"
+ );
+}
diff --git a/parquet/tests/arrow_reader/bad_raw_metadata.bin
b/parquet/tests/arrow_reader/bad_raw_metadata.bin
new file mode 100644
index 000000000..47f9aa1c0
Binary files /dev/null and b/parquet/tests/arrow_reader/bad_raw_metadata.bin
differ