This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 80a6ef750 Fix parquet clippy lints (#1254) (#2377)
80a6ef750 is described below
commit 80a6ef750e6635486cc1d3bbf9890423b27596c4
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Aug 8 21:52:23 2022 +0100
Fix parquet clippy lints (#1254) (#2377)
* Fix parquet clippy lints (#1254)
* Fix MapArrayReader
---
.github/workflows/parquet.yml | 6 +-
parquet/src/arrow/array_reader/builder.rs | 5 +-
parquet/src/arrow/array_reader/list_array.rs | 14 +--
parquet/src/arrow/array_reader/map_array.rs | 2 +
parquet/src/arrow/array_reader/null_array.rs | 4 +-
parquet/src/arrow/array_reader/primitive_array.rs | 7 +-
parquet/src/arrow/array_reader/struct_array.rs | 1 -
parquet/src/arrow/array_reader/test_util.rs | 3 +-
parquet/src/arrow/arrow_reader.rs | 7 +-
parquet/src/arrow/buffer/converter.rs | 50 ++++-----
parquet/src/arrow/buffer/dictionary_buffer.rs | 1 +
parquet/src/arrow/record_reader/mod.rs | 6 +-
parquet/src/arrow/schema.rs | 4 +-
parquet/src/basic.rs | 37 ++++---
parquet/src/bin/parquet-fromcsv.rs | 6 +-
parquet/src/bin/parquet-read.rs | 2 +-
parquet/src/bin/parquet-schema.rs | 4 +-
parquet/src/column/page.rs | 6 ++
parquet/src/column/reader.rs | 4 +-
parquet/src/column/writer/encoder.rs | 1 -
parquet/src/column/writer/mod.rs | 37 +------
parquet/src/compression.rs | 2 +-
parquet/src/data_type.rs | 15 ---
parquet/src/encodings/decoding.rs | 121 ++++++++++------------
parquet/src/encodings/encoding/dict_encoder.rs | 6 +-
parquet/src/encodings/encoding/mod.rs | 63 +++++++----
parquet/src/encodings/levels.rs | 4 +-
parquet/src/encodings/rle.rs | 11 +-
parquet/src/errors.rs | 6 +-
parquet/src/file/metadata.rs | 12 +++
parquet/src/file/page_index/index.rs | 1 +
parquet/src/file/page_index/mod.rs | 2 +
parquet/src/file/page_index/range.rs | 1 +
parquet/src/file/properties.rs | 3 +-
parquet/src/file/serialized_reader.rs | 8 +-
parquet/src/lib.rs | 8 --
parquet/src/record/reader.rs | 8 +-
parquet/src/record/triplet.rs | 2 +-
parquet/src/util/bit_util.rs | 72 +------------
parquet/src/util/io.rs | 2 +-
parquet/src/util/test_common/mod.rs | 15 +--
parquet/src/util/test_common/page_util.rs | 97 +----------------
parquet/src/util/test_common/rand_gen.rs | 101 ++++++++++++++++--
43 files changed, 316 insertions(+), 451 deletions(-)
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index d8e09f04b..ad694ec60 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -128,8 +128,4 @@ jobs:
rustup component add clippy
- name: Run clippy
run: |
- # Only run clippy for the library at this time,
- # as there are clippy errors for other targets
- cargo clippy -p parquet --all-features --lib -- -D warnings
- # https://github.com/apache/arrow-rs/issues/1254
- #cargo clippy -p parquet --all-targets --all-features -- -D warnings
+ cargo clippy -p parquet --all-targets --all-features -- -D warnings
diff --git a/parquet/src/arrow/array_reader/builder.rs
b/parquet/src/arrow/array_reader/builder.rs
index d9c1bedb2..32ffaeb9d 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -104,13 +104,11 @@ fn build_list_reader(
let data_type = field.arrow_type.clone();
let item_reader = build_reader(&children[0], row_groups)?;
- let item_type = item_reader.get_data_type().clone();
match is_large {
false => Ok(Box::new(ListArrayReader::<i32>::new(
item_reader,
data_type,
- item_type,
field.def_level,
field.rep_level,
field.nullable,
@@ -118,7 +116,6 @@ fn build_list_reader(
true => Ok(Box::new(ListArrayReader::<i64>::new(
item_reader,
data_type,
- item_type,
field.def_level,
field.rep_level,
field.nullable,
@@ -318,7 +315,7 @@ mod tests {
use super::*;
use crate::arrow::parquet_to_arrow_schema;
use crate::file::reader::{FileReader, SerializedFileReader};
- use crate::util::test_common::get_test_file;
+ use crate::util::test_common::file_util::get_test_file;
use arrow::datatypes::Field;
use std::sync::Arc;
diff --git a/parquet/src/arrow/array_reader/list_array.rs
b/parquet/src/arrow/array_reader/list_array.rs
index 94770794c..2acd59dcc 100644
--- a/parquet/src/arrow/array_reader/list_array.rs
+++ b/parquet/src/arrow/array_reader/list_array.rs
@@ -34,7 +34,6 @@ use std::sync::Arc;
pub struct ListArrayReader<OffsetSize: OffsetSizeTrait> {
item_reader: Box<dyn ArrayReader>,
data_type: ArrowType,
- item_type: ArrowType,
/// The definition level at which this list is not null
def_level: i16,
/// The repetition level that corresponds to a new value in this array
@@ -49,7 +48,6 @@ impl<OffsetSize: OffsetSizeTrait> ListArrayReader<OffsetSize>
{
pub fn new(
item_reader: Box<dyn ArrayReader>,
data_type: ArrowType,
- item_type: ArrowType,
def_level: i16,
rep_level: i16,
nullable: bool,
@@ -57,7 +55,6 @@ impl<OffsetSize: OffsetSizeTrait> ListArrayReader<OffsetSize>
{
Self {
item_reader,
data_type,
- item_type,
def_level,
rep_level,
nullable,
@@ -304,13 +301,13 @@ mod tests {
// ]
let l3_item_type = ArrowType::Int32;
- let l3_type = list_type::<OffsetSize>(l3_item_type.clone(), true);
+ let l3_type = list_type::<OffsetSize>(l3_item_type, true);
let l2_item_type = l3_type.clone();
- let l2_type = list_type::<OffsetSize>(l2_item_type.clone(), true);
+ let l2_type = list_type::<OffsetSize>(l2_item_type, true);
let l1_item_type = l2_type.clone();
- let l1_type = list_type::<OffsetSize>(l1_item_type.clone(), false);
+ let l1_type = list_type::<OffsetSize>(l1_item_type, false);
let leaf = PrimitiveArray::<Int32Type>::from_iter(vec![
Some(1),
@@ -387,7 +384,6 @@ mod tests {
let l3 = ListArrayReader::<OffsetSize>::new(
Box::new(item_array_reader),
l3_type,
- l3_item_type,
5,
3,
true,
@@ -396,7 +392,6 @@ mod tests {
let l2 = ListArrayReader::<OffsetSize>::new(
Box::new(l3),
l2_type,
- l2_item_type,
3,
2,
false,
@@ -405,7 +400,6 @@ mod tests {
let mut l1 = ListArrayReader::<OffsetSize>::new(
Box::new(l2),
l1_type,
- l1_item_type,
2,
1,
true,
@@ -456,7 +450,6 @@ mod tests {
let mut list_array_reader = ListArrayReader::<OffsetSize>::new(
Box::new(item_array_reader),
list_type::<OffsetSize>(ArrowType::Int32, true),
- ArrowType::Int32,
1,
1,
false,
@@ -509,7 +502,6 @@ mod tests {
let mut list_array_reader = ListArrayReader::<OffsetSize>::new(
Box::new(item_array_reader),
list_type::<OffsetSize>(ArrowType::Int32, true),
- ArrowType::Int32,
2,
1,
true,
diff --git a/parquet/src/arrow/array_reader/map_array.rs
b/parquet/src/arrow/array_reader/map_array.rs
index 83ba63ca1..3ba7f6960 100644
--- a/parquet/src/arrow/array_reader/map_array.rs
+++ b/parquet/src/arrow/array_reader/map_array.rs
@@ -32,6 +32,7 @@ pub struct MapArrayReader {
value_reader: Box<dyn ArrayReader>,
data_type: ArrowType,
map_def_level: i16,
+ #[allow(unused)]
map_rep_level: i16,
}
@@ -47,6 +48,7 @@ impl MapArrayReader {
key_reader,
value_reader,
data_type,
// These are the wrong way round https://github.com/apache/arrow-rs/issues/1699
map_def_level: rep_level,
map_rep_level: def_level,
}
diff --git a/parquet/src/arrow/array_reader/null_array.rs
b/parquet/src/arrow/array_reader/null_array.rs
index 682d15f8a..405633f0a 100644
--- a/parquet/src/arrow/array_reader/null_array.rs
+++ b/parquet/src/arrow/array_reader/null_array.rs
@@ -39,7 +39,6 @@ where
pages: Box<dyn PageIterator>,
def_levels_buffer: Option<Buffer>,
rep_levels_buffer: Option<Buffer>,
- column_desc: ColumnDescPtr,
record_reader: RecordReader<T>,
}
@@ -50,14 +49,13 @@ where
{
/// Construct null array reader.
pub fn new(pages: Box<dyn PageIterator>, column_desc: ColumnDescPtr) ->
Result<Self> {
- let record_reader = RecordReader::<T>::new(column_desc.clone());
+ let record_reader = RecordReader::<T>::new(column_desc);
Ok(Self {
data_type: ArrowType::Null,
pages,
def_levels_buffer: None,
rep_levels_buffer: None,
- column_desc,
record_reader,
})
}
diff --git a/parquet/src/arrow/array_reader/primitive_array.rs
b/parquet/src/arrow/array_reader/primitive_array.rs
index 45614d509..516a3f50c 100644
--- a/parquet/src/arrow/array_reader/primitive_array.rs
+++ b/parquet/src/arrow/array_reader/primitive_array.rs
@@ -44,7 +44,6 @@ where
pages: Box<dyn PageIterator>,
def_levels_buffer: Option<Buffer>,
rep_levels_buffer: Option<Buffer>,
- column_desc: ColumnDescPtr,
record_reader: RecordReader<T>,
}
@@ -67,14 +66,13 @@ where
.clone(),
};
- let record_reader = RecordReader::<T>::new(column_desc.clone());
+ let record_reader = RecordReader::<T>::new(column_desc);
Ok(Self {
data_type,
pages,
def_levels_buffer: None,
rep_levels_buffer: None,
- column_desc,
record_reader,
})
}
@@ -244,7 +242,7 @@ mod tests {
use crate::data_type::Int32Type;
use crate::schema::parser::parse_message_type;
use crate::schema::types::SchemaDescriptor;
- use crate::util::test_common::make_pages;
+ use crate::util::test_common::rand_gen::make_pages;
use crate::util::InMemoryPageIterator;
use arrow::array::PrimitiveArray;
use arrow::datatypes::ArrowPrimitiveType;
@@ -252,6 +250,7 @@ mod tests {
use rand::distributions::uniform::SampleUniform;
use std::collections::VecDeque;
+ #[allow(clippy::too_many_arguments)]
fn make_column_chunks<T: DataType>(
column_desc: ColumnDescPtr,
encoding: Encoding,
diff --git a/parquet/src/arrow/array_reader/struct_array.rs
b/parquet/src/arrow/array_reader/struct_array.rs
index b333c66cb..f682f146c 100644
--- a/parquet/src/arrow/array_reader/struct_array.rs
+++ b/parquet/src/arrow/array_reader/struct_array.rs
@@ -314,7 +314,6 @@ mod tests {
let list_reader = ListArrayReader::<i32>::new(
Box::new(reader),
expected_l.data_type().clone(),
- ArrowType::Int32,
3,
1,
true,
diff --git a/parquet/src/arrow/array_reader/test_util.rs
b/parquet/src/arrow/array_reader/test_util.rs
index da9b8d3bf..ca1aabfd4 100644
--- a/parquet/src/arrow/array_reader/test_util.rs
+++ b/parquet/src/arrow/array_reader/test_util.rs
@@ -48,8 +48,7 @@ pub fn utf8_column() -> ColumnDescPtr {
/// Encode `data` with the provided `encoding`
pub fn encode_byte_array(encoding: Encoding, data: &[ByteArray]) ->
ByteBufferPtr {
- let descriptor = utf8_column();
- let mut encoder = get_encoder::<ByteArrayType>(descriptor,
encoding).unwrap();
+ let mut encoder = get_encoder::<ByteArrayType>(encoding).unwrap();
encoder.put(data).unwrap();
encoder.flush_buffer().unwrap()
diff --git a/parquet/src/arrow/arrow_reader.rs
b/parquet/src/arrow/arrow_reader.rs
index 594b41676..726c20060 100644
--- a/parquet/src/arrow/arrow_reader.rs
+++ b/parquet/src/arrow/arrow_reader.rs
@@ -85,6 +85,7 @@ pub(crate) struct RowSelection {
impl RowSelection {
/// Select `row_count` rows
+ #[allow(unused)]
pub fn select(row_count: usize) -> Self {
Self {
row_count,
@@ -93,6 +94,7 @@ impl RowSelection {
}
/// Skip `row_count` rows
+ #[allow(unused)]
pub fn skip(row_count: usize) -> Self {
Self {
row_count,
@@ -109,7 +111,7 @@ pub struct ArrowReaderOptions {
impl ArrowReaderOptions {
/// Create a new [`ArrowReaderOptions`] with the default settings
- fn new() -> Self {
+ pub fn new() -> Self {
Self::default()
}
@@ -129,6 +131,7 @@ impl ArrowReaderOptions {
/// Scan rows from the parquet file according to the provided `selection`
///
/// TODO: Make public once row selection fully implemented (#1792)
+ #[allow(unused)]
pub(crate) fn with_row_selection(
self,
selection: impl Into<Vec<RowSelection>>,
@@ -433,7 +436,7 @@ mod tests {
use crate::file::writer::SerializedFileWriter;
use crate::schema::parser::parse_message_type;
use crate::schema::types::{Type, TypePtr};
- use crate::util::test_common::RandGen;
+ use crate::util::test_common::rand_gen::RandGen;
#[test]
fn test_arrow_reader_all_columns() {
diff --git a/parquet/src/arrow/buffer/converter.rs
b/parquet/src/arrow/buffer/converter.rs
index 4cd058942..d8cbd256a 100644
--- a/parquet/src/arrow/buffer/converter.rs
+++ b/parquet/src/arrow/buffer/converter.rs
@@ -17,18 +17,21 @@
use crate::data_type::{ByteArray, FixedLenByteArray, Int96};
use arrow::array::{
- Array, ArrayRef, BasicDecimalArray, BinaryArray, BinaryBuilder,
Decimal128Array,
- FixedSizeBinaryArray, FixedSizeBinaryBuilder, IntervalDayTimeArray,
- IntervalDayTimeBuilder, IntervalYearMonthArray, IntervalYearMonthBuilder,
- LargeBinaryArray, LargeBinaryBuilder, LargeStringArray, LargeStringBuilder,
- StringArray, StringBuilder, TimestampNanosecondArray,
+ Array, ArrayRef, BasicDecimalArray, Decimal128Array, FixedSizeBinaryArray,
+ FixedSizeBinaryBuilder, IntervalDayTimeArray, IntervalDayTimeBuilder,
+ IntervalYearMonthArray, IntervalYearMonthBuilder, TimestampNanosecondArray,
};
-use std::convert::{From, TryInto};
use std::sync::Arc;
use crate::errors::Result;
use std::marker::PhantomData;
+#[cfg(test)]
+use arrow::array::{
+ BinaryArray, BinaryBuilder, LargeStringArray, LargeStringBuilder,
StringArray,
+ StringBuilder,
+};
+
/// A converter is used to consume record reader's content and convert it to
arrow
/// primitive array.
pub trait Converter<S, T> {
@@ -185,8 +188,10 @@ impl Converter<Vec<Option<Int96>>,
TimestampNanosecondArray> for Int96ArrayConve
}
}
+#[cfg(test)]
pub struct Utf8ArrayConverter {}
+#[cfg(test)]
impl Converter<Vec<Option<ByteArray>>, StringArray> for Utf8ArrayConverter {
fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<StringArray> {
let data_size = source
@@ -206,8 +211,10 @@ impl Converter<Vec<Option<ByteArray>>, StringArray> for
Utf8ArrayConverter {
}
}
+#[cfg(test)]
pub struct LargeUtf8ArrayConverter {}
+#[cfg(test)]
impl Converter<Vec<Option<ByteArray>>, LargeStringArray> for
LargeUtf8ArrayConverter {
fn convert(&self, source: Vec<Option<ByteArray>>) ->
Result<LargeStringArray> {
let data_size = source
@@ -227,8 +234,10 @@ impl Converter<Vec<Option<ByteArray>>, LargeStringArray>
for LargeUtf8ArrayConve
}
}
+#[cfg(test)]
pub struct BinaryArrayConverter {}
+#[cfg(test)]
impl Converter<Vec<Option<ByteArray>>, BinaryArray> for BinaryArrayConverter {
fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<BinaryArray> {
let mut builder = BinaryBuilder::new(source.len());
@@ -243,33 +252,9 @@ impl Converter<Vec<Option<ByteArray>>, BinaryArray> for
BinaryArrayConverter {
}
}
-pub struct LargeBinaryArrayConverter {}
-
-impl Converter<Vec<Option<ByteArray>>, LargeBinaryArray> for
LargeBinaryArrayConverter {
- fn convert(&self, source: Vec<Option<ByteArray>>) ->
Result<LargeBinaryArray> {
- let mut builder = LargeBinaryBuilder::new(source.len());
- for v in source {
- match v {
- Some(array) => builder.append_value(array.data()),
- None => builder.append_null(),
- }
- }
-
- Ok(builder.finish())
- }
-}
-
+#[cfg(test)]
pub type Utf8Converter =
ArrayRefConverter<Vec<Option<ByteArray>>, StringArray, Utf8ArrayConverter>;
-pub type LargeUtf8Converter =
- ArrayRefConverter<Vec<Option<ByteArray>>, LargeStringArray,
LargeUtf8ArrayConverter>;
-pub type BinaryConverter =
- ArrayRefConverter<Vec<Option<ByteArray>>, BinaryArray,
BinaryArrayConverter>;
-pub type LargeBinaryConverter = ArrayRefConverter<
- Vec<Option<ByteArray>>,
- LargeBinaryArray,
- LargeBinaryArrayConverter,
->;
pub type Int96Converter =
ArrayRefConverter<Vec<Option<Int96>>, TimestampNanosecondArray,
Int96ArrayConverter>;
@@ -299,11 +284,13 @@ pub type DecimalFixedLengthByteArrayConverter =
ArrayRefConverter<
pub type DecimalByteArrayConvert =
ArrayRefConverter<Vec<Option<ByteArray>>, Decimal128Array,
DecimalArrayConverter>;
+#[cfg(test)]
pub struct FromConverter<S, T> {
_source: PhantomData<S>,
_dest: PhantomData<T>,
}
+#[cfg(test)]
impl<S, T> FromConverter<S, T>
where
T: From<S>,
@@ -316,6 +303,7 @@ where
}
}
+#[cfg(test)]
impl<S, T> Converter<S, T> for FromConverter<S, T>
where
T: From<S>,
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs
b/parquet/src/arrow/buffer/dictionary_buffer.rs
index b64b2946b..ae9e3590d 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -49,6 +49,7 @@ impl<K: ScalarValue, V: ScalarValue> Default for
DictionaryBuffer<K, V> {
impl<K: ScalarValue + ArrowNativeType + Ord, V: ScalarValue + OffsetSizeTrait>
DictionaryBuffer<K, V>
{
+ #[allow(unused)]
pub fn len(&self) -> usize {
match self {
Self::Dict { keys, .. } => keys.len(),
diff --git a/parquet/src/arrow/record_reader/mod.rs
b/parquet/src/arrow/record_reader/mod.rs
index 60fe4cdc9..88d45f3d7 100644
--- a/parquet/src/arrow/record_reader/mod.rs
+++ b/parquet/src/arrow/record_reader/mod.rs
@@ -214,6 +214,7 @@ where
}
/// Returns number of records stored in buffer.
+ #[allow(unused)]
pub fn num_records(&self) -> usize {
self.num_records
}
@@ -273,11 +274,6 @@ where
.map(|levels| levels.split_bitmask(self.num_values))
}
- /// Returns column reader.
- pub(crate) fn column_reader(&self) -> Option<&ColumnReader<CV>> {
- self.column_reader.as_ref()
- }
-
/// Try to read one batch of data.
fn read_one_batch(&mut self, batch_size: usize) -> Result<usize> {
let rep_levels = self
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 2cb47bc00..01aefcd48 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -73,7 +73,7 @@ pub fn parquet_to_arrow_schema_by_columns(
// Add the Arrow metadata to the Parquet metadata skipping keys that
collide
if let Some(arrow_schema) = &maybe_schema {
arrow_schema.metadata().iter().for_each(|(k, v)| {
- metadata.entry(k.clone()).or_insert(v.clone());
+ metadata.entry(k.clone()).or_insert_with(|| v.clone());
});
}
@@ -100,7 +100,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) ->
Result<Schema> {
Ok(message) => message
.header_as_schema()
.map(arrow::ipc::convert::fb_to_schema)
- .ok_or(arrow_err!("the message is not Arrow Schema")),
+ .ok_or_else(|| arrow_err!("the message is not Arrow
Schema")),
Err(err) => {
// The flatbuffers implementation returns an error on
verification error.
Err(arrow_err!(
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 0cf1d5121..9c58f764c 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -18,7 +18,7 @@
//! Contains Rust mappings for Thrift definition.
//! Refer to `parquet.thrift` file to see raw definitions.
-use std::{convert, fmt, result, str};
+use std::{fmt, result, str};
use parquet_format as parquet;
@@ -42,6 +42,7 @@ pub use parquet_format::{
/// For example INT16 is not included as a type since a good encoding of INT32
/// would handle this.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[allow(non_camel_case_types)]
pub enum Type {
BOOLEAN,
INT32,
@@ -63,6 +64,7 @@ pub enum Type {
/// This struct was renamed from `LogicalType` in version 4.0.0.
/// If targeting Parquet format 2.4.0 or above, please use [LogicalType]
instead.
#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum ConvertedType {
NONE,
/// A BYTE_ARRAY actually contains UTF8 encoded chars.
@@ -197,6 +199,7 @@ pub enum LogicalType {
/// Representation of field types in schema.
#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum Repetition {
/// Field is required (can not be null) and each record has exactly 1
value.
REQUIRED,
@@ -213,6 +216,7 @@ pub enum Repetition {
/// Not all encodings are valid for all types. These enums are also used to
specify the
/// encoding of definition and repetition levels.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
+#[allow(non_camel_case_types)]
pub enum Encoding {
/// Default byte encoding.
/// - BOOLEAN - 1 bit per value, 0 is false; 1 is true.
@@ -294,6 +298,7 @@ pub enum Compression {
/// Available data pages for Parquet file format.
/// Note that some of the page types may not be supported.
#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum PageType {
DATA_PAGE,
INDEX_PAGE,
@@ -313,6 +318,7 @@ pub enum PageType {
/// See reference in
/// <https://github.com/apache/parquet-cpp/blob/master/src/parquet/types.h>
#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum SortOrder {
/// Signed (either value or legacy byte-wise) comparison.
SIGNED,
@@ -328,6 +334,7 @@ pub enum SortOrder {
/// If column order is undefined, then it is the legacy behaviour and all
values should
/// be compared as signed values/bytes.
#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum ColumnOrder {
/// Column uses the order defined by its logical or physical type
/// (if there is no logical type), parquet-format 2.4.0+.
@@ -489,7 +496,7 @@ impl fmt::Display for ColumnOrder {
// ----------------------------------------------------------------------
// parquet::Type <=> Type conversion
-impl convert::From<parquet::Type> for Type {
+impl From<parquet::Type> for Type {
fn from(value: parquet::Type) -> Self {
match value {
parquet::Type::Boolean => Type::BOOLEAN,
@@ -504,7 +511,7 @@ impl convert::From<parquet::Type> for Type {
}
}
-impl convert::From<Type> for parquet::Type {
+impl From<Type> for parquet::Type {
fn from(value: Type) -> Self {
match value {
Type::BOOLEAN => parquet::Type::Boolean,
@@ -522,7 +529,7 @@ impl convert::From<Type> for parquet::Type {
// ----------------------------------------------------------------------
// parquet::ConvertedType <=> ConvertedType conversion
-impl convert::From<Option<parquet::ConvertedType>> for ConvertedType {
+impl From<Option<parquet::ConvertedType>> for ConvertedType {
fn from(option: Option<parquet::ConvertedType>) -> Self {
match option {
None => ConvertedType::NONE,
@@ -558,7 +565,7 @@ impl convert::From<Option<parquet::ConvertedType>> for
ConvertedType {
}
}
-impl convert::From<ConvertedType> for Option<parquet::ConvertedType> {
+impl From<ConvertedType> for Option<parquet::ConvertedType> {
fn from(value: ConvertedType) -> Self {
match value {
ConvertedType::NONE => None,
@@ -595,7 +602,7 @@ impl convert::From<ConvertedType> for
Option<parquet::ConvertedType> {
// ----------------------------------------------------------------------
// parquet::LogicalType <=> LogicalType conversion
-impl convert::From<parquet::LogicalType> for LogicalType {
+impl From<parquet::LogicalType> for LogicalType {
fn from(value: parquet::LogicalType) -> Self {
match value {
parquet::LogicalType::STRING(_) => LogicalType::String,
@@ -627,7 +634,7 @@ impl convert::From<parquet::LogicalType> for LogicalType {
}
}
-impl convert::From<LogicalType> for parquet::LogicalType {
+impl From<LogicalType> for parquet::LogicalType {
fn from(value: LogicalType) -> Self {
match value {
LogicalType::String =>
parquet::LogicalType::STRING(Default::default()),
@@ -723,7 +730,7 @@ impl From<Option<LogicalType>> for ConvertedType {
// ----------------------------------------------------------------------
// parquet::FieldRepetitionType <=> Repetition conversion
-impl convert::From<parquet::FieldRepetitionType> for Repetition {
+impl From<parquet::FieldRepetitionType> for Repetition {
fn from(value: parquet::FieldRepetitionType) -> Self {
match value {
parquet::FieldRepetitionType::Required => Repetition::REQUIRED,
@@ -733,7 +740,7 @@ impl convert::From<parquet::FieldRepetitionType> for
Repetition {
}
}
-impl convert::From<Repetition> for parquet::FieldRepetitionType {
+impl From<Repetition> for parquet::FieldRepetitionType {
fn from(value: Repetition) -> Self {
match value {
Repetition::REQUIRED => parquet::FieldRepetitionType::Required,
@@ -746,7 +753,7 @@ impl convert::From<Repetition> for
parquet::FieldRepetitionType {
// ----------------------------------------------------------------------
// parquet::Encoding <=> Encoding conversion
-impl convert::From<parquet::Encoding> for Encoding {
+impl From<parquet::Encoding> for Encoding {
fn from(value: parquet::Encoding) -> Self {
match value {
parquet::Encoding::Plain => Encoding::PLAIN,
@@ -762,7 +769,7 @@ impl convert::From<parquet::Encoding> for Encoding {
}
}
-impl convert::From<Encoding> for parquet::Encoding {
+impl From<Encoding> for parquet::Encoding {
fn from(value: Encoding) -> Self {
match value {
Encoding::PLAIN => parquet::Encoding::Plain,
@@ -781,7 +788,7 @@ impl convert::From<Encoding> for parquet::Encoding {
// ----------------------------------------------------------------------
// parquet::CompressionCodec <=> Compression conversion
-impl convert::From<parquet::CompressionCodec> for Compression {
+impl From<parquet::CompressionCodec> for Compression {
fn from(value: parquet::CompressionCodec) -> Self {
match value {
parquet::CompressionCodec::Uncompressed =>
Compression::UNCOMPRESSED,
@@ -795,7 +802,7 @@ impl convert::From<parquet::CompressionCodec> for
Compression {
}
}
-impl convert::From<Compression> for parquet::CompressionCodec {
+impl From<Compression> for parquet::CompressionCodec {
fn from(value: Compression) -> Self {
match value {
Compression::UNCOMPRESSED =>
parquet::CompressionCodec::Uncompressed,
@@ -812,7 +819,7 @@ impl convert::From<Compression> for
parquet::CompressionCodec {
// ----------------------------------------------------------------------
// parquet::PageType <=> PageType conversion
-impl convert::From<parquet::PageType> for PageType {
+impl From<parquet::PageType> for PageType {
fn from(value: parquet::PageType) -> Self {
match value {
parquet::PageType::DataPage => PageType::DATA_PAGE,
@@ -823,7 +830,7 @@ impl convert::From<parquet::PageType> for PageType {
}
}
-impl convert::From<PageType> for parquet::PageType {
+impl From<PageType> for parquet::PageType {
fn from(value: PageType) -> Self {
match value {
PageType::DATA_PAGE => parquet::PageType::DataPage,
diff --git a/parquet/src/bin/parquet-fromcsv.rs
b/parquet/src/bin/parquet-fromcsv.rs
index aa1d50563..827aa7311 100644
--- a/parquet/src/bin/parquet-fromcsv.rs
+++ b/parquet/src/bin/parquet-fromcsv.rs
@@ -439,7 +439,7 @@ mod tests {
// test default values
assert_eq!(args.input_format, CsvDialect::Csv);
assert_eq!(args.batch_size, 1000);
- assert_eq!(args.has_header, false);
+ assert!(!args.has_header);
assert_eq!(args.delimiter, None);
assert_eq!(args.get_delimiter(), b',');
assert_eq!(args.record_terminator, None);
@@ -553,7 +553,7 @@ mod tests {
Field::new("field5", DataType::Utf8, false),
]));
- let reader_builder = configure_reader_builder(&args,
arrow_schema.clone());
+ let reader_builder = configure_reader_builder(&args, arrow_schema);
let builder_debug = format!("{:?}", reader_builder);
assert_debug_text(&builder_debug, "has_header", "false");
assert_debug_text(&builder_debug, "delimiter", "Some(44)");
@@ -585,7 +585,7 @@ mod tests {
Field::new("field4", DataType::Utf8, false),
Field::new("field5", DataType::Utf8, false),
]));
- let reader_builder = configure_reader_builder(&args,
arrow_schema.clone());
+ let reader_builder = configure_reader_builder(&args, arrow_schema);
let builder_debug = format!("{:?}", reader_builder);
assert_debug_text(&builder_debug, "has_header", "true");
assert_debug_text(&builder_debug, "delimiter", "Some(9)");
diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs
index 0530afaa7..927d96f8c 100644
--- a/parquet/src/bin/parquet-read.rs
+++ b/parquet/src/bin/parquet-read.rs
@@ -93,6 +93,6 @@ fn print_row(row: &Row, json: bool) {
if json {
println!("{}", row.to_json_value())
} else {
- println!("{}", row.to_string());
+ println!("{}", row);
}
}
diff --git a/parquet/src/bin/parquet-schema.rs
b/parquet/src/bin/parquet-schema.rs
index b875b0e71..68c52def7 100644
--- a/parquet/src/bin/parquet-schema.rs
+++ b/parquet/src/bin/parquet-schema.rs
@@ -67,9 +67,9 @@ fn main() {
println!("Metadata for file: {}", &filename);
println!();
if verbose {
- print_parquet_metadata(&mut std::io::stdout(), &metadata);
+ print_parquet_metadata(&mut std::io::stdout(), metadata);
} else {
- print_file_metadata(&mut std::io::stdout(),
&metadata.file_metadata());
+ print_file_metadata(&mut std::io::stdout(),
metadata.file_metadata());
}
}
}
diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs
index c61e9c0b3..1658797ce 100644
--- a/parquet/src/column/page.rs
+++ b/parquet/src/column/page.rs
@@ -174,6 +174,12 @@ pub struct PageWriteSpec {
pub bytes_written: u64,
}
+impl Default for PageWriteSpec {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl PageWriteSpec {
/// Creates new spec with default page write metrics.
pub fn new() -> Self {
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs
index 8e0fa5a4d..8eee807c2 100644
--- a/parquet/src/column/reader.rs
+++ b/parquet/src/column/reader.rs
@@ -544,8 +544,8 @@ mod tests {
use crate::basic::Type as PhysicalType;
use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as
SchemaType};
- use crate::util::test_common::make_pages;
use crate::util::test_common::page_util::InMemoryPageReader;
+ use crate::util::test_common::rand_gen::make_pages;
const NUM_LEVELS: usize = 128;
const NUM_PAGES: usize = 2;
@@ -1231,6 +1231,7 @@ mod tests {
// Helper function for the general case of `read_batch()` where
`values`,
// `def_levels` and `rep_levels` are always provided with enough space.
+ #[allow(clippy::too_many_arguments)]
fn test_read_batch_general(
&mut self,
desc: ColumnDescPtr,
@@ -1262,6 +1263,7 @@ mod tests {
// Helper function to test `read_batch()` method with custom buffers
for values,
// definition and repetition levels.
+ #[allow(clippy::too_many_arguments)]
fn test_read_batch(
&mut self,
desc: ColumnDescPtr,
diff --git a/parquet/src/column/writer/encoder.rs
b/parquet/src/column/writer/encoder.rs
index d7363129f..4fb4f210e 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -168,7 +168,6 @@ impl<T: DataType> ColumnValueEncoder for
ColumnValueEncoderImpl<T> {
// Set either main encoder or fallback encoder.
let encoder = get_encoder(
- descr.clone(),
props
.encoding(descr.path())
.unwrap_or_else(|| fallback_encoding(T::get_physical_type(),
props)),
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 20a3babd8..669cacee6 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -258,6 +258,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
}
}
+ #[allow(clippy::too_many_arguments)]
pub(crate) fn write_batch_internal(
&mut self,
values: &E::Values,
@@ -907,12 +908,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E>
{
_ => {}
}
}
-
- /// Returns reference to the underlying page writer.
- /// This method is intended to use in tests only.
- fn get_page_writer_ref(&self) -> &dyn PageWriter {
- self.page_writer.as_ref()
- }
}
fn update_min<T: ParquetValueType>(
@@ -1101,7 +1096,7 @@ mod tests {
writer::SerializedPageWriter,
};
use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as
SchemaType};
- use crate::util::{io::FileSource, test_common::random_numbers_range};
+ use crate::util::{io::FileSource,
test_common::rand_gen::random_numbers_range};
use super::*;
@@ -2408,20 +2403,6 @@ mod tests {
get_typed_column_writer::<T>(column_writer)
}
- /// Returns decimals column reader.
- fn get_test_decimals_column_reader<T: DataType>(
- page_reader: Box<dyn PageReader>,
- max_def_level: i16,
- max_rep_level: i16,
- ) -> ColumnReaderImpl<T> {
- let descr = Arc::new(get_test_decimals_column_descr::<T>(
- max_def_level,
- max_rep_level,
- ));
- let column_reader = get_column_reader(descr, page_reader);
- get_typed_column_reader::<T>(column_reader)
- }
-
/// Returns descriptor for Decimal type with primitive column.
fn get_test_decimals_column_descr<T: DataType>(
max_def_level: i16,
@@ -2456,20 +2437,6 @@ mod tests {
get_typed_column_writer::<T>(column_writer)
}
- /// Returns column reader for UINT32 Column provided as ConvertedType only
- fn get_test_unsigned_int_given_as_converted_column_reader<T: DataType>(
- page_reader: Box<dyn PageReader>,
- max_def_level: i16,
- max_rep_level: i16,
- ) -> ColumnReaderImpl<T> {
- let descr =
Arc::new(get_test_converted_type_unsigned_integer_column_descr::<T>(
- max_def_level,
- max_rep_level,
- ));
- let column_reader = get_column_reader(descr, page_reader);
- get_typed_column_reader::<T>(column_reader)
- }
-
/// Returns column descriptor for UINT32 Column provided as ConvertedType
only
fn get_test_converted_type_unsigned_integer_column_descr<T: DataType>(
max_def_level: i16,
diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs
index a5e49360a..ee5141cbe 100644
--- a/parquet/src/compression.rs
+++ b/parquet/src/compression.rs
@@ -329,7 +329,7 @@ pub use zstd_codec::*;
mod tests {
use super::*;
- use crate::util::test_common::*;
+ use crate::util::test_common::rand_gen::random_bytes;
fn test_roundtrip(c: CodecType, data: &[u8]) {
let mut c1 = create_codec(c).unwrap().unwrap();
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 43c9a4238..005b35570 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -574,8 +574,6 @@ pub(crate) mod private {
use super::{ParquetError, Result, SliceAsBytes};
- pub type BitIndex = u64;
-
/// Sealed trait to start to remove specialisation from implementations
///
/// This is done to force the associated value type to be unimplementable
outside of this
@@ -710,19 +708,6 @@ pub(crate) mod private {
}
}
- /// Hopelessly unsafe function that emulates `num::as_ne_bytes`
- ///
- /// It is not recommended to use this outside of this private module as,
while it
- /// _should_ work for primitive values, it is little better than a
transmutation
- /// and can act as a backdoor into mis-interpreting types as arbitary byte
slices
- #[inline]
- fn as_raw<'a, T>(value: *const T) -> &'a [u8] {
- unsafe {
- let value = value as *const u8;
- std::slice::from_raw_parts(value, std::mem::size_of::<T>())
- }
- }
-
macro_rules! impl_from_raw {
($ty: ty, $physical_ty: expr, $self: ident => $as_i64: block) => {
impl ParquetValueType for $ty {
diff --git a/parquet/src/encodings/decoding.rs
b/parquet/src/encodings/decoding.rs
index 58aa592d1..3db050a0f 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -322,6 +322,12 @@ pub struct DictDecoder<T: DataType> {
num_values: usize,
}
+impl<T: DataType> Default for DictDecoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DictDecoder<T> {
/// Creates new dictionary decoder.
pub fn new() -> Self {
@@ -394,6 +400,12 @@ pub struct RleValueDecoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for RleValueDecoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> RleValueDecoder<T> {
pub fn new() -> Self {
Self {
@@ -485,6 +497,15 @@ pub struct DeltaBitPackDecoder<T: DataType> {
last_value: T::T,
}
+impl<T: DataType> Default for DeltaBitPackDecoder<T>
+where
+ T::T: Default + FromPrimitive + WrappingAdd + Copy,
+{
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DeltaBitPackDecoder<T>
where
T::T: Default + FromPrimitive + WrappingAdd + Copy,
@@ -706,8 +727,6 @@ where
Ok(to_read)
}
-
-
fn values_left(&self) -> usize {
self.values_left
}
@@ -751,6 +770,12 @@ pub struct DeltaLengthByteArrayDecoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for DeltaLengthByteArrayDecoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DeltaLengthByteArrayDecoder<T> {
/// Creates new delta length byte array decoder.
pub fn new() -> Self {
@@ -829,7 +854,10 @@ impl<T: DataType> Decoder<T> for
DeltaLengthByteArrayDecoder<T> {
Type::BYTE_ARRAY => {
let num_values = cmp::min(num_values, self.num_values);
- let next_offset: i32 =
self.lengths[self.current_idx..self.current_idx + num_values].iter().sum();
+ let next_offset: i32 = self.lengths
+ [self.current_idx..self.current_idx + num_values]
+ .iter()
+ .sum();
self.current_idx += num_values;
self.offset += next_offset as usize;
@@ -837,8 +865,9 @@ impl<T: DataType> Decoder<T> for
DeltaLengthByteArrayDecoder<T> {
self.num_values -= num_values;
Ok(num_values)
}
- other_type => Err(general_err!(
- "DeltaLengthByteArrayDecoder not support {}, only support byte
array", other_type
+ other_type => Err(general_err!(
+ "DeltaLengthByteArrayDecoder not support {}, only support byte
array",
+ other_type
)),
}
}
@@ -874,6 +903,12 @@ pub struct DeltaByteArrayDecoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for DeltaByteArrayDecoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DeltaByteArrayDecoder<T> {
/// Creates new delta byte array decoder.
pub fn new() -> Self {
@@ -990,7 +1025,7 @@ mod tests {
use crate::schema::types::{
ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType,
};
- use crate::util::{bit_util::set_array_bit, test_common::RandGen};
+ use crate::util::test_common::rand_gen::RandGen;
#[test]
fn test_get_decoders() {
@@ -1068,13 +1103,7 @@ mod tests {
fn test_plain_skip_all_int32() {
let data = vec![42, 18, 52];
let data_bytes = Int32Type::to_byte_array(&data[..]);
- test_plain_skip::<Int32Type>(
- ByteBufferPtr::new(data_bytes),
- 3,
- 5,
- -1,
- &[],
- );
+ test_plain_skip::<Int32Type>(ByteBufferPtr::new(data_bytes), 3, 5, -1,
&[]);
}
#[test]
@@ -1096,7 +1125,6 @@ mod tests {
);
}
-
#[test]
fn test_plain_decode_int64() {
let data = vec![42, 18, 52];
@@ -1128,16 +1156,9 @@ mod tests {
fn test_plain_skip_all_int64() {
let data = vec![42, 18, 52];
let data_bytes = Int64Type::to_byte_array(&data[..]);
- test_plain_skip::<Int64Type>(
- ByteBufferPtr::new(data_bytes),
- 3,
- 3,
- -1,
- &[],
- );
+ test_plain_skip::<Int64Type>(ByteBufferPtr::new(data_bytes), 3, 3, -1,
&[]);
}
-
#[test]
fn test_plain_decode_float() {
let data = vec![3.14, 2.414, 12.51];
@@ -1169,13 +1190,7 @@ mod tests {
fn test_plain_skip_all_float() {
let data = vec![3.14, 2.414, 12.51];
let data_bytes = FloatType::to_byte_array(&data[..]);
- test_plain_skip::<FloatType>(
- ByteBufferPtr::new(data_bytes),
- 3,
- 4,
- -1,
- &[],
- );
+ test_plain_skip::<FloatType>(ByteBufferPtr::new(data_bytes), 3, 4, -1,
&[]);
}
#[test]
@@ -1195,13 +1210,7 @@ mod tests {
fn test_plain_skip_all_double() {
let data = vec![3.14f64, 2.414f64, 12.51f64];
let data_bytes = DoubleType::to_byte_array(&data[..]);
- test_plain_skip::<DoubleType>(
- ByteBufferPtr::new(data_bytes),
- 3,
- 5,
- -1,
- &[],
- );
+ test_plain_skip::<DoubleType>(ByteBufferPtr::new(data_bytes), 3, 5,
-1, &[]);
}
#[test]
@@ -1261,13 +1270,7 @@ mod tests {
data[2].set_data(10, 20, 30);
data[3].set_data(40, 50, 60);
let data_bytes = Int96Type::to_byte_array(&data[..]);
- test_plain_skip::<Int96Type>(
- ByteBufferPtr::new(data_bytes),
- 4,
- 8,
- -1,
- &[],
- );
+ test_plain_skip::<Int96Type>(ByteBufferPtr::new(data_bytes), 4, 8, -1,
&[]);
}
#[test]
@@ -1307,16 +1310,9 @@ mod tests {
false, true, false, false, true, false, true, true, false, true,
];
let data_bytes = BoolType::to_byte_array(&data[..]);
- test_plain_skip::<BoolType>(
- ByteBufferPtr::new(data_bytes),
- 10,
- 20,
- -1,
- &[],
- );
+ test_plain_skip::<BoolType>(ByteBufferPtr::new(data_bytes), 10, 20,
-1, &[]);
}
-
#[test]
fn test_plain_decode_byte_array() {
let mut data = vec![ByteArray::new(); 2];
@@ -1354,13 +1350,7 @@ mod tests {
data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes()));
data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes()));
let data_bytes = ByteArrayType::to_byte_array(&data[..]);
- test_plain_skip::<ByteArrayType>(
- ByteBufferPtr::new(data_bytes),
- 2,
- 2,
- -1,
- &[],
- );
+ test_plain_skip::<ByteArrayType>(ByteBufferPtr::new(data_bytes), 2, 2,
-1, &[]);
}
#[test]
@@ -1587,7 +1577,6 @@ mod tests {
];
test_skip::<Int32Type>(block_data.clone(),
Encoding::DELTA_BINARY_PACKED, 5);
test_skip::<Int32Type>(block_data, Encoding::DELTA_BINARY_PACKED, 100);
-
}
#[test]
@@ -1833,8 +1822,7 @@ mod tests {
let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type());
// Encode data
- let mut encoder =
- get_encoder::<T>(col_descr.clone(), encoding).expect("get
encoder");
+ let mut encoder = get_encoder::<T>(encoding).expect("get encoder");
for v in &data[..] {
encoder.put(&v[..]).expect("ok to encode");
@@ -1867,17 +1855,14 @@ mod tests {
let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type());
// Encode data
- let mut encoder =
- get_encoder::<T>(col_descr.clone(), encoding).expect("get
encoder");
+ let mut encoder = get_encoder::<T>(encoding).expect("get encoder");
encoder.put(&data).expect("ok to encode");
let bytes = encoder.flush_buffer().expect("ok to flush buffer");
let mut decoder = get_decoder::<T>(col_descr, encoding).expect("get
decoder");
- decoder
- .set_data(bytes, data.len())
- .expect("ok to set data");
+ decoder.set_data(bytes, data.len()).expect("ok to set data");
if skip >= data.len() {
let skipped = decoder.skip(skip).expect("ok to skip");
@@ -1894,7 +1879,7 @@ mod tests {
let expected = &data[skip..];
let mut buffer = vec![T::T::default(); remaining];
let fetched = decoder.get(&mut buffer).expect("ok to decode");
- assert_eq!(remaining,fetched);
+ assert_eq!(remaining, fetched);
assert_eq!(&buffer, expected);
}
}
@@ -1966,7 +1951,7 @@ mod tests {
v.push(0);
}
if *item {
- set_array_bit(&mut v[..], i);
+ v[i / 8] |= 1 << (i % 8);
}
}
v
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs
b/parquet/src/encodings/encoding/dict_encoder.rs
index a7855cc84..18deba65e 100644
--- a/parquet/src/encodings/encoding/dict_encoder.rs
+++ b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -73,9 +73,6 @@ impl<T: DataType> Storage for KeyStorage<T> {
/// (max bit width = 32), followed by the values encoded using RLE/Bit packed
described
/// above (with the given bit width).
pub struct DictEncoder<T: DataType> {
- /// Descriptor for the column to be encoded.
- desc: ColumnDescPtr,
-
interner: Interner<KeyStorage<T>>,
/// The buffered indices
@@ -92,7 +89,6 @@ impl<T: DataType> DictEncoder<T> {
};
Self {
- desc,
interner: Interner::new(storage),
indices: vec![],
}
@@ -117,7 +113,7 @@ impl<T: DataType> DictEncoder<T> {
/// Writes out the dictionary values with PLAIN encoding in a byte buffer,
and return
/// the result.
pub fn write_dict(&self) -> Result<ByteBufferPtr> {
- let mut plain_encoder = PlainEncoder::<T>::new(self.desc.clone(),
vec![]);
+ let mut plain_encoder = PlainEncoder::<T>::new();
plain_encoder.put(&self.interner.storage().uniques)?;
plain_encoder.flush_buffer()
}
diff --git a/parquet/src/encodings/encoding/mod.rs
b/parquet/src/encodings/encoding/mod.rs
index 2c6bd9b37..050f1b9f8 100644
--- a/parquet/src/encodings/encoding/mod.rs
+++ b/parquet/src/encodings/encoding/mod.rs
@@ -24,7 +24,6 @@ use crate::data_type::private::ParquetValueType;
use crate::data_type::*;
use crate::encodings::rle::RleEncoder;
use crate::errors::{ParquetError, Result};
-use crate::schema::types::ColumnDescPtr;
use crate::util::{
bit_util::{self, num_required_bits, BitWriter},
memory::ByteBufferPtr,
@@ -76,12 +75,9 @@ pub trait Encoder<T: DataType> {
/// Gets a encoder for the particular data type `T` and encoding `encoding`.
Memory usage
/// for the encoder instance is tracked by `mem_tracker`.
-pub fn get_encoder<T: DataType>(
- desc: ColumnDescPtr,
- encoding: Encoding,
-) -> Result<Box<dyn Encoder<T>>> {
+pub fn get_encoder<T: DataType>(encoding: Encoding) -> Result<Box<dyn
Encoder<T>>> {
let encoder: Box<dyn Encoder<T>> = match encoding {
- Encoding::PLAIN => Box::new(PlainEncoder::new(desc, vec![])),
+ Encoding::PLAIN => Box::new(PlainEncoder::new()),
Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => {
return Err(general_err!(
"Cannot initialize this encoding through this function"
@@ -113,17 +109,21 @@ pub fn get_encoder<T: DataType>(
pub struct PlainEncoder<T: DataType> {
buffer: Vec<u8>,
bit_writer: BitWriter,
- desc: ColumnDescPtr,
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for PlainEncoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> PlainEncoder<T> {
/// Creates new plain encoder.
- pub fn new(desc: ColumnDescPtr, buffer: Vec<u8>) -> Self {
+ pub fn new() -> Self {
Self {
- buffer,
+ buffer: vec![],
bit_writer: BitWriter::new(256),
- desc,
_phantom: PhantomData,
}
}
@@ -171,6 +171,12 @@ pub struct RleValueEncoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for RleValueEncoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> RleValueEncoder<T> {
/// Creates new rle value encoder.
pub fn new() -> Self {
@@ -280,6 +286,12 @@ pub struct DeltaBitPackEncoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for DeltaBitPackEncoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DeltaBitPackEncoder<T> {
/// Creates new delta bit packed encoder.
pub fn new() -> Self {
@@ -531,6 +543,12 @@ pub struct DeltaLengthByteArrayEncoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for DeltaLengthByteArrayEncoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DeltaLengthByteArrayEncoder<T> {
/// Creates new delta length byte array encoder.
pub fn new() -> Self {
@@ -610,6 +628,12 @@ pub struct DeltaByteArrayEncoder<T: DataType> {
_phantom: PhantomData<T>,
}
+impl<T: DataType> Default for DeltaByteArrayEncoder<T> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl<T: DataType> DeltaByteArrayEncoder<T> {
/// Creates new delta byte array encoder.
pub fn new() -> Self {
@@ -705,7 +729,7 @@ mod tests {
use crate::schema::types::{
ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType,
};
- use crate::util::test_common::{random_bytes, RandGen};
+ use crate::util::test_common::rand_gen::{random_bytes, RandGen};
const TEST_SET_SIZE: usize = 1024;
@@ -847,7 +871,7 @@ mod tests {
Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
Box::new(create_test_dict_encoder::<T>(type_length))
}
- _ => create_test_encoder::<T>(type_length, encoding),
+ _ => create_test_encoder::<T>(encoding),
};
assert_eq!(encoder.estimated_data_encoded_size(), initial_size);
@@ -900,7 +924,7 @@ mod tests {
#[test]
fn test_issue_47() {
let mut encoder =
- create_test_encoder::<ByteArrayType>(0,
Encoding::DELTA_BYTE_ARRAY);
+ create_test_encoder::<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
let mut decoder =
create_test_decoder::<ByteArrayType>(0,
Encoding::DELTA_BYTE_ARRAY);
@@ -952,7 +976,7 @@ mod tests {
impl<T: DataType + RandGen<T>> EncodingTester<T> for T {
fn test_internal(enc: Encoding, total: usize, type_length: i32) ->
Result<()> {
- let mut encoder = create_test_encoder::<T>(type_length, enc);
+ let mut encoder = create_test_encoder::<T>(enc);
let mut decoder = create_test_decoder::<T>(type_length, enc);
let mut values = <T as RandGen<T>>::gen_vec(type_length, total);
let mut result_data = vec![T::T::default(); total];
@@ -1054,8 +1078,7 @@ mod tests {
encoding: Encoding,
err: Option<ParquetError>,
) {
- let descr = create_test_col_desc_ptr(-1, T::get_physical_type());
- let encoder = get_encoder::<T>(descr, encoding);
+ let encoder = get_encoder::<T>(encoding);
match err {
Some(parquet_error) => {
assert!(encoder.is_err());
@@ -1082,12 +1105,8 @@ mod tests {
))
}
- fn create_test_encoder<T: DataType>(
- type_len: i32,
- enc: Encoding,
- ) -> Box<dyn Encoder<T>> {
- let desc = create_test_col_desc_ptr(type_len, T::get_physical_type());
- get_encoder(desc, enc).unwrap()
+ fn create_test_encoder<T: DataType>(enc: Encoding) -> Box<dyn Encoder<T>> {
+ get_encoder(enc).unwrap()
}
fn create_test_decoder<T: DataType>(
diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs
index 62c68d843..40af135ff 100644
--- a/parquet/src/encodings/levels.rs
+++ b/parquet/src/encodings/levels.rs
@@ -142,12 +142,14 @@ impl LevelEncoder {
/// Decoder for definition/repetition levels.
/// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and
/// RLE for Data Page v2.
+#[allow(unused)]
pub enum LevelDecoder {
Rle(Option<usize>, RleDecoder),
RleV2(Option<usize>, RleDecoder),
BitPacked(Option<usize>, u8, BitReader),
}
+#[allow(unused)]
impl LevelDecoder {
/// Creates new level decoder based on encoding and max
definition/repetition level.
/// This method only initializes level decoder, `set_data` method must be
called
@@ -274,7 +276,7 @@ impl LevelDecoder {
mod tests {
use super::*;
- use crate::util::test_common::random_numbers_range;
+ use crate::util::test_common::rand_gen::random_numbers_range;
fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16,
v2: bool) {
let mut encoder = if v2 {
diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index aad833e0e..39a0aa4d0 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -45,7 +45,6 @@ use crate::util::{
/// Maximum groups per bit-packed run. Current value is 64.
const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6;
const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8;
-const MAX_WRITER_BUF_SIZE: usize = 1 << 10;
/// A RLE/Bit-Packing hybrid encoder.
// TODO: tracking memory usage
@@ -56,9 +55,6 @@ pub struct RleEncoder {
// Underlying writer which holds an internal buffer.
bit_writer: BitWriter,
- // The maximum byte size a single run can take.
- max_run_byte_size: usize,
-
// Buffered values for bit-packed runs.
buffered_values: [u64; 8],
@@ -82,6 +78,7 @@ pub struct RleEncoder {
}
impl RleEncoder {
+ #[allow(unused)]
pub fn new(bit_width: u8, buffer_len: usize) -> Self {
let buffer = Vec::with_capacity(buffer_len);
RleEncoder::new_from_buf(bit_width, buffer)
@@ -89,12 +86,10 @@ impl RleEncoder {
/// Initialize the encoder from existing `buffer`
pub fn new_from_buf(bit_width: u8, buffer: Vec<u8>) -> Self {
- let max_run_byte_size = RleEncoder::min_buffer_size(bit_width);
let bit_writer = BitWriter::new_from_buf(buffer);
RleEncoder {
bit_width,
bit_writer,
- max_run_byte_size,
buffered_values: [0; 8],
num_buffered_values: 0,
current_value: 0,
@@ -162,6 +157,7 @@ impl RleEncoder {
}
#[inline]
+ #[allow(unused)]
pub fn buffer(&self) -> &[u8] {
self.bit_writer.buffer()
}
@@ -171,6 +167,7 @@ impl RleEncoder {
self.bit_writer.bytes_written()
}
+ #[allow(unused)]
pub fn is_empty(&self) -> bool {
self.bit_writer.bytes_written() == 0
}
@@ -184,6 +181,7 @@ impl RleEncoder {
/// Borrow equivalent of the `consume` method.
/// Call `clear()` after invoking this method.
#[inline]
+ #[allow(unused)]
pub fn flush_buffer(&mut self) -> &[u8] {
self.flush();
self.bit_writer.flush_buffer()
@@ -192,6 +190,7 @@ impl RleEncoder {
/// Clears the internal state so this encoder can be reused (e.g., after
becoming
/// full).
#[inline]
+ #[allow(unused)]
pub fn clear(&mut self) {
self.bit_writer.clear();
self.num_buffered_values = 0;
diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs
index c2fb5bd66..683784193 100644
--- a/parquet/src/errors.rs
+++ b/parquet/src/errors.rs
@@ -148,8 +148,8 @@ macro_rules! arrow_err {
// Convert parquet error into other errors
#[cfg(any(feature = "arrow", test))]
-impl Into<ArrowError> for ParquetError {
- fn into(self) -> ArrowError {
- ArrowError::ParquetError(format!("{}", self))
+impl From<ParquetError> for ArrowError {
+ fn from(p: ParquetError) -> Self {
+ Self::ParquetError(format!("{}", p))
}
}
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 58eaf7a8c..018dd95d9 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -834,6 +834,12 @@ pub struct ColumnIndexBuilder {
valid: bool,
}
+impl Default for ColumnIndexBuilder {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl ColumnIndexBuilder {
pub fn new() -> Self {
ColumnIndexBuilder {
@@ -887,6 +893,12 @@ pub struct OffsetIndexBuilder {
current_first_row_index: i64,
}
+impl Default for OffsetIndexBuilder {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl OffsetIndexBuilder {
pub fn new() -> Self {
OffsetIndexBuilder {
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index 45381234c..f29b80acc 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -47,6 +47,7 @@ impl<T> PageIndex<T> {
}
#[derive(Debug, Clone, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum Index {
/// Sometimes reading page index from parquet file
/// will only return pageLocations without min_max index,
diff --git a/parquet/src/file/page_index/mod.rs
b/parquet/src/file/page_index/mod.rs
index fc87ef204..bb7808f16 100644
--- a/parquet/src/file/page_index/mod.rs
+++ b/parquet/src/file/page_index/mod.rs
@@ -17,4 +17,6 @@
pub mod index;
pub mod index_reader;
+
+#[cfg(test)]
pub(crate) mod range;
diff --git a/parquet/src/file/page_index/range.rs
b/parquet/src/file/page_index/range.rs
index 06c06553c..e9741ec8e 100644
--- a/parquet/src/file/page_index/range.rs
+++ b/parquet/src/file/page_index/range.rs
@@ -213,6 +213,7 @@ impl RowRanges {
result
}
+ #[allow(unused)]
pub fn row_count(&self) -> usize {
self.ranges.iter().map(|x| x.count()).sum()
}
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 9ca7c4daa..c96439820 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -69,6 +69,7 @@ const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY");
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(non_camel_case_types)]
pub enum WriterVersion {
PARQUET_1_0,
PARQUET_2_0,
@@ -360,7 +361,7 @@ impl WriterPropertiesBuilder {
fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
self.column_properties
.entry(col)
- .or_insert(ColumnProperties::new())
+ .or_insert_with(ColumnProperties::new)
}
/// Sets encoding for a column.
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index e0053225d..045410d03 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -141,6 +141,7 @@ pub struct SerializedFileReader<R: ChunkReader> {
/// A builder for [`ReadOptions`].
/// For the predicates that are added to the builder,
/// they will be chained using 'AND' to filter the row groups.
+#[derive(Default)]
pub struct ReadOptionsBuilder {
predicates: Vec<Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>>,
enable_page_index: bool,
@@ -149,10 +150,7 @@ pub struct ReadOptionsBuilder {
impl ReadOptionsBuilder {
/// New builder
pub fn new() -> Self {
- ReadOptionsBuilder {
- predicates: vec![],
- enable_page_index: false,
- }
+ Self::default()
}
/// Add a predicate on row group metadata to the reading option,
@@ -692,7 +690,7 @@ mod tests {
use crate::record::RowAccessor;
use crate::schema::parser::parse_message_type;
use crate::util::bit_util::from_le_slice;
- use crate::util::test_common::{get_test_file, get_test_path};
+ use crate::util::test_common::file_util::{get_test_file, get_test_path};
use parquet_format::BoundaryOrder;
use std::sync::Arc;
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 5ee43f8ad..90fe399e7 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -33,14 +33,6 @@
//!
//! 3. [arrow::async_reader] for `async` reading and writing parquet
//! files to Arrow `RecordBatch`es (requires the `async` feature).
-#![allow(dead_code)]
-#![allow(non_camel_case_types)]
-#![allow(
- clippy::from_over_into,
- clippy::new_without_default,
- clippy::or_fun_call,
- clippy::too_many_arguments
-)]
/// Defines a an item with an experimental public API
///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index 05b63661f..0b7e04587 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -40,6 +40,12 @@ pub struct TreeBuilder {
batch_size: usize,
}
+impl Default for TreeBuilder {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl TreeBuilder {
/// Creates new tree builder with default parameters.
pub fn new() -> Self {
@@ -822,7 +828,7 @@ mod tests {
use crate::file::reader::{FileReader, SerializedFileReader};
use crate::record::api::{Field, Row, RowAccessor, RowFormatter};
use crate::schema::parser::parse_message_type;
- use crate::util::test_common::{get_test_file, get_test_path};
+ use crate::util::test_common::file_util::{get_test_file, get_test_path};
use std::convert::TryFrom;
// Convenient macros to assemble row, list, map, and group.
diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs
index de566a122..5a7e2a0ca 100644
--- a/parquet/src/record/triplet.rs
+++ b/parquet/src/record/triplet.rs
@@ -363,7 +363,7 @@ mod tests {
use crate::file::reader::{FileReader, SerializedFileReader};
use crate::schema::types::ColumnPath;
- use crate::util::test_common::get_test_file;
+ use crate::util::test_common::file_util::get_test_file;
#[test]
#[should_panic(expected = "Expected positive batch size, found: 0")]
diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs
index 1dec9b03f..c70384e7a 100644
--- a/parquet/src/util/bit_util.rs
+++ b/parquet/src/util/bit_util.rs
@@ -101,38 +101,6 @@ macro_rules! read_num_bytes {
}};
}
-/// Converts value `val` of type `T` to a byte vector, by reading `num_bytes`
from `val`.
-/// NOTE: if `val` is less than the size of `T` then it can be truncated.
-#[inline]
-pub fn convert_to_bytes<T>(val: &T, num_bytes: usize) -> Vec<u8>
-where
- T: ?Sized + AsBytes,
-{
- let mut bytes: Vec<u8> = vec![0; num_bytes];
- memcpy_value(val.as_bytes(), num_bytes, &mut bytes);
- bytes
-}
-
-#[inline]
-pub fn memcpy(source: &[u8], target: &mut [u8]) {
- assert!(target.len() >= source.len());
- target[..source.len()].copy_from_slice(source)
-}
-
-#[inline]
-pub fn memcpy_value<T>(source: &T, num_bytes: usize, target: &mut [u8])
-where
- T: ?Sized + AsBytes,
-{
- assert!(
- target.len() >= num_bytes,
- "Not enough space. Only had {} bytes but need to put {} bytes",
- target.len(),
- num_bytes
- );
- memcpy(&source.as_bytes()[..num_bytes], target)
-}
-
/// Returns the ceil of value/divisor.
///
/// This function should be removed after
@@ -152,16 +120,6 @@ pub fn trailing_bits(v: u64, num_bits: usize) -> u64 {
}
}
-#[inline]
-pub fn set_array_bit(bits: &mut [u8], i: usize) {
- bits[i / 8] |= 1 << (i % 8);
-}
-
-#[inline]
-pub fn unset_array_bit(bits: &mut [u8], i: usize) {
- bits[i / 8] &= !(1 << (i % 8));
-}
-
/// Returns the minimum number of bits needed to represent the value 'x'
#[inline]
pub fn num_required_bits(x: u64) -> u8 {
@@ -728,20 +686,11 @@ impl From<Vec<u8>> for BitReader {
}
}
-/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here
`factor` must
-/// be a power of 2.
-///
-/// Copied from the arrow crate to make arrow optional
-pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize {
- debug_assert!(factor > 0 && (factor & (factor - 1)) == 0);
- (num + (factor - 1)) & !(factor - 1)
-}
-
#[cfg(test)]
mod tests {
- use super::super::test_common::*;
use super::*;
+ use crate::util::test_common::rand_gen::random_numbers;
use rand::distributions::{Distribution, Standard};
use std::fmt::Debug;
@@ -874,25 +823,6 @@ mod tests {
assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-2));
}
- #[test]
- fn test_set_array_bit() {
- let mut buffer = vec![0, 0, 0];
- set_array_bit(&mut buffer[..], 1);
- assert_eq!(buffer, vec![2, 0, 0]);
- set_array_bit(&mut buffer[..], 4);
- assert_eq!(buffer, vec![18, 0, 0]);
- unset_array_bit(&mut buffer[..], 1);
- assert_eq!(buffer, vec![16, 0, 0]);
- set_array_bit(&mut buffer[..], 10);
- assert_eq!(buffer, vec![16, 4, 0]);
- set_array_bit(&mut buffer[..], 10);
- assert_eq!(buffer, vec![16, 4, 0]);
- set_array_bit(&mut buffer[..], 11);
- assert_eq!(buffer, vec![16, 12, 0]);
- unset_array_bit(&mut buffer[..], 10);
- assert_eq!(buffer, vec![16, 8, 0]);
- }
-
#[test]
fn test_num_required_bits() {
assert_eq!(num_required_bits(0), 0);
diff --git a/parquet/src/util/io.rs b/parquet/src/util/io.rs
index 016d45075..1fb92063e 100644
--- a/parquet/src/util/io.rs
+++ b/parquet/src/util/io.rs
@@ -167,7 +167,7 @@ mod tests {
use std::iter;
- use crate::util::test_common::get_test_file;
+ use crate::util::test_common::file_util::get_test_file;
#[test]
fn test_io_read_fully() {
diff --git a/parquet/src/util/test_common/mod.rs
b/parquet/src/util/test_common/mod.rs
index f0beb16ca..504219eca 100644
--- a/parquet/src/util/test_common/mod.rs
+++ b/parquet/src/util/test_common/mod.rs
@@ -15,17 +15,10 @@
// specific language governing permissions and limitations
// under the License.
-pub mod file_util;
pub mod page_util;
-pub mod rand_gen;
-
-pub use self::rand_gen::random_bools;
-pub use self::rand_gen::random_bytes;
-pub use self::rand_gen::random_numbers;
-pub use self::rand_gen::random_numbers_range;
-pub use self::rand_gen::RandGen;
-pub use self::file_util::get_test_file;
-pub use self::file_util::get_test_path;
+#[cfg(test)]
+pub mod file_util;
-pub use self::page_util::make_pages;
+#[cfg(test)]
+pub mod rand_gen;
\ No newline at end of file
diff --git a/parquet/src/util/test_common/page_util.rs
b/parquet/src/util/test_common/page_util.rs
index bc197d00e..dffcb2a44 100644
--- a/parquet/src/util/test_common/page_util.rs
+++ b/parquet/src/util/test_common/page_util.rs
@@ -19,14 +19,11 @@ use crate::basic::Encoding;
use crate::column::page::{Page, PageIterator};
use crate::column::page::{PageMetadata, PageReader};
use crate::data_type::DataType;
-use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder};
+use crate::encodings::encoding::{get_encoder, Encoder};
use crate::encodings::levels::LevelEncoder;
use crate::errors::Result;
use crate::schema::types::{ColumnDescPtr, SchemaDescPtr};
use crate::util::memory::ByteBufferPtr;
-use crate::util::test_common::random_numbers_range;
-use rand::distributions::uniform::SampleUniform;
-use std::collections::VecDeque;
use std::mem;
pub trait DataPageBuilder {
@@ -44,7 +41,6 @@ pub trait DataPageBuilder {
/// - consume()
/// in order to populate and obtain a data page.
pub struct DataPageBuilderImpl {
- desc: ColumnDescPtr,
encoding: Option<Encoding>,
num_values: u32,
buffer: Vec<u8>,
@@ -57,9 +53,8 @@ impl DataPageBuilderImpl {
// `num_values` is the number of non-null values to put in the data page.
// `datapage_v2` flag is used to indicate if the generated data page
should use V2
// format or not.
- pub fn new(desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) ->
Self {
+ pub fn new(_desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) ->
Self {
DataPageBuilderImpl {
- desc,
encoding: None,
num_values,
buffer: vec![],
@@ -111,8 +106,7 @@ impl DataPageBuilder for DataPageBuilderImpl {
);
self.encoding = Some(encoding);
let mut encoder: Box<dyn Encoder<T>> =
- get_encoder::<T>(self.desc.clone(), encoding)
- .expect("get_encoder() should be OK");
+ get_encoder::<T>(encoding).expect("get_encoder() should be OK");
encoder.put(values).expect("put() should be OK");
let encoded_values = encoder
.flush_buffer()
@@ -229,88 +223,3 @@ impl<I: Iterator<Item = Vec<Page>> + Send> PageIterator
for InMemoryPageIterator
Ok(self.column_desc.clone())
}
}
-
-pub fn make_pages<T: DataType>(
- desc: ColumnDescPtr,
- encoding: Encoding,
- num_pages: usize,
- levels_per_page: usize,
- min: T::T,
- max: T::T,
- def_levels: &mut Vec<i16>,
- rep_levels: &mut Vec<i16>,
- values: &mut Vec<T::T>,
- pages: &mut VecDeque<Page>,
- use_v2: bool,
-) where
- T::T: PartialOrd + SampleUniform + Copy,
-{
- let mut num_values = 0;
- let max_def_level = desc.max_def_level();
- let max_rep_level = desc.max_rep_level();
-
- let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
-
- for i in 0..num_pages {
- let mut num_values_cur_page = 0;
- let level_range = i * levels_per_page..(i + 1) * levels_per_page;
-
- if max_def_level > 0 {
- random_numbers_range(levels_per_page, 0, max_def_level + 1,
def_levels);
- for dl in &def_levels[level_range.clone()] {
- if *dl == max_def_level {
- num_values_cur_page += 1;
- }
- }
- } else {
- num_values_cur_page = levels_per_page;
- }
- if max_rep_level > 0 {
- random_numbers_range(levels_per_page, 0, max_rep_level + 1,
rep_levels);
- }
- random_numbers_range(num_values_cur_page, min, max, values);
-
- // Generate the current page
-
- let mut pb =
- DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32,
use_v2);
- if max_rep_level > 0 {
- pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
- }
- if max_def_level > 0 {
- pb.add_def_levels(max_def_level, &def_levels[level_range]);
- }
-
- let value_range = num_values..num_values + num_values_cur_page;
- match encoding {
- Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
- let _ = dict_encoder.put(&values[value_range.clone()]);
- let indices = dict_encoder
- .write_indices()
- .expect("write_indices() should be OK");
- pb.add_indices(indices);
- }
- Encoding::PLAIN => {
- pb.add_values::<T>(encoding, &values[value_range]);
- }
- enc => panic!("Unexpected encoding {}", enc),
- }
-
- let data_page = pb.consume();
- pages.push_back(data_page);
- num_values += num_values_cur_page;
- }
-
- if encoding == Encoding::PLAIN_DICTIONARY || encoding ==
Encoding::RLE_DICTIONARY {
- let dict = dict_encoder
- .write_dict()
- .expect("write_dict() should be OK");
- let dict_page = Page::DictionaryPage {
- buf: dict,
- num_values: dict_encoder.num_entries() as u32,
- encoding: Encoding::RLE_DICTIONARY,
- is_sorted: false,
- };
- pages.push_front(dict_page);
- }
-}
diff --git a/parquet/src/util/test_common/rand_gen.rs
b/parquet/src/util/test_common/rand_gen.rs
index d9c256577..4e54aa799 100644
--- a/parquet/src/util/test_common/rand_gen.rs
+++ b/parquet/src/util/test_common/rand_gen.rs
@@ -15,13 +15,19 @@
// specific language governing permissions and limitations
// under the License.
+use crate::basic::Encoding;
+use crate::column::page::Page;
use rand::{
distributions::{uniform::SampleUniform, Distribution, Standard},
thread_rng, Rng,
};
+use std::collections::VecDeque;
use crate::data_type::*;
+use crate::encodings::encoding::{DictEncoder, Encoder};
+use crate::schema::types::ColumnDescPtr;
use crate::util::memory::ByteBufferPtr;
+use crate::util::{DataPageBuilder, DataPageBuilderImpl};
/// Random generator of data type `T` values and sequences.
pub trait RandGen<T: DataType> {
@@ -106,15 +112,6 @@ pub fn random_bytes(n: usize) -> Vec<u8> {
result
}
-pub fn random_bools(n: usize) -> Vec<bool> {
- let mut result = vec![];
- let mut rng = thread_rng();
- for _ in 0..n {
- result.push(rng.gen::<bool>());
- }
- result
-}
-
pub fn random_numbers<T>(n: usize) -> Vec<T>
where
Standard: Distribution<T>,
@@ -132,3 +129,89 @@ where
result.push(rng.gen_range(low..high));
}
}
+
+#[allow(clippy::too_many_arguments)]
+pub fn make_pages<T: DataType>(
+ desc: ColumnDescPtr,
+ encoding: Encoding,
+ num_pages: usize,
+ levels_per_page: usize,
+ min: T::T,
+ max: T::T,
+ def_levels: &mut Vec<i16>,
+ rep_levels: &mut Vec<i16>,
+ values: &mut Vec<T::T>,
+ pages: &mut VecDeque<Page>,
+ use_v2: bool,
+) where
+ T::T: PartialOrd + SampleUniform + Copy,
+{
+ let mut num_values = 0;
+ let max_def_level = desc.max_def_level();
+ let max_rep_level = desc.max_rep_level();
+
+ let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
+
+ for i in 0..num_pages {
+ let mut num_values_cur_page = 0;
+ let level_range = i * levels_per_page..(i + 1) * levels_per_page;
+
+ if max_def_level > 0 {
+ random_numbers_range(levels_per_page, 0, max_def_level + 1,
def_levels);
+ for dl in &def_levels[level_range.clone()] {
+ if *dl == max_def_level {
+ num_values_cur_page += 1;
+ }
+ }
+ } else {
+ num_values_cur_page = levels_per_page;
+ }
+ if max_rep_level > 0 {
+ random_numbers_range(levels_per_page, 0, max_rep_level + 1,
rep_levels);
+ }
+ random_numbers_range(num_values_cur_page, min, max, values);
+
+ // Generate the current page
+
+ let mut pb =
+ DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32,
use_v2);
+ if max_rep_level > 0 {
+ pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
+ }
+ if max_def_level > 0 {
+ pb.add_def_levels(max_def_level, &def_levels[level_range]);
+ }
+
+ let value_range = num_values..num_values + num_values_cur_page;
+ match encoding {
+ Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
+ let _ = dict_encoder.put(&values[value_range.clone()]);
+ let indices = dict_encoder
+ .write_indices()
+ .expect("write_indices() should be OK");
+ pb.add_indices(indices);
+ }
+ Encoding::PLAIN => {
+ pb.add_values::<T>(encoding, &values[value_range]);
+ }
+ enc => panic!("Unexpected encoding {}", enc),
+ }
+
+ let data_page = pb.consume();
+ pages.push_back(data_page);
+ num_values += num_values_cur_page;
+ }
+
+ if encoding == Encoding::PLAIN_DICTIONARY || encoding ==
Encoding::RLE_DICTIONARY {
+ let dict = dict_encoder
+ .write_dict()
+ .expect("write_dict() should be OK");
+ let dict_page = Page::DictionaryPage {
+ buf: dict,
+ num_values: dict_encoder.num_entries() as u32,
+ encoding: Encoding::RLE_DICTIONARY,
+ is_sorted: false,
+ };
+ pages.push_front(dict_page);
+ }
+}