This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 894dd17 ARROW-12043: [Rust] [Parquet] Write FSB arrays
894dd17 is described below
commit 894dd17c9602439c2b84c0b849fb0966606ceb1c
Author: Neville Dipale <[email protected]>
AuthorDate: Sun Mar 28 11:01:56 2021 +0200
ARROW-12043: [Rust] [Parquet] Write FSB arrays
Minor change to compute the definition and repetition levels for
FixedSizeBinary (FSB) arrays and write them out to Parquet. Added a
roundtrip test.
Closes #9771 from nevi-me/ARROW-12043
Authored-by: Neville Dipale <[email protected]>
Signed-off-by: Neville Dipale <[email protected]>
---
rust/parquet/src/arrow/arrow_writer.rs | 28 ++++++++++++++++++++++------
rust/parquet/src/arrow/levels.rs | 30 ++++++++++++++++++++----------
rust/parquet/src/arrow/mod.rs | 2 +-
3 files changed, 43 insertions(+), 17 deletions(-)
diff --git a/rust/parquet/src/arrow/arrow_writer.rs
b/rust/parquet/src/arrow/arrow_writer.rs
index 1ce907f..a3577ca 100644
--- a/rust/parquet/src/arrow/arrow_writer.rs
+++ b/rust/parquet/src/arrow/arrow_writer.rs
@@ -146,7 +146,8 @@ fn write_leaves(
| ArrowDataType::Binary
| ArrowDataType::Utf8
| ArrowDataType::LargeUtf8
- | ArrowDataType::Decimal(_, _) => {
+ | ArrowDataType::Decimal(_, _)
+ | ArrowDataType::FixedSizeBinary(_) => {
let mut col_writer = get_col_writer(&mut row_group_writer)?;
write_leaf(
&mut col_writer,
@@ -189,11 +190,14 @@ fn write_leaves(
ArrowDataType::Float16 => Err(ParquetError::ArrowError(
"Float16 arrays not supported".to_string(),
)),
- ArrowDataType::FixedSizeList(_, _)
- | ArrowDataType::FixedSizeBinary(_)
- | ArrowDataType::Union(_) => Err(ParquetError::NYI(
- "Attempting to write an Arrow type that is not yet
implemented".to_string(),
- )),
+ ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_) => {
+ Err(ParquetError::NYI(
+ format!(
+ "Attempting to write an Arrow type {:?} to parquet that is
not yet implemented",
+ array.data_type()
+ )
+ ))
+ }
}
}
@@ -1225,6 +1229,18 @@ mod tests {
}
#[test]
+ fn fixed_size_binary_single_column() {
+ let mut builder = FixedSizeBinaryBuilder::new(16, 4);
+ builder.append_value(b"0123").unwrap();
+ builder.append_null().unwrap();
+ builder.append_value(b"8910").unwrap();
+ builder.append_value(b"1112").unwrap();
+ let array = Arc::new(builder.finish());
+
+ one_column_roundtrip("timestamp_millisecond_single_column", array,
true);
+ }
+
+ #[test]
fn string_single_column() {
let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i|
i.to_string()).collect();
let raw_strs = raw_values.iter().map(|s| s.as_str());
diff --git a/rust/parquet/src/arrow/levels.rs b/rust/parquet/src/arrow/levels.rs
index 641e330..2168670 100644
--- a/rust/parquet/src/arrow/levels.rs
+++ b/rust/parquet/src/arrow/levels.rs
@@ -136,7 +136,8 @@ impl LevelInfo {
| DataType::Interval(_)
| DataType::Binary
| DataType::LargeBinary
- | DataType::Decimal(_, _) => {
+ | DataType::Decimal(_, _)
+ | DataType::FixedSizeBinary(_) => {
// we return a vector of 1 value to represent the primitive
vec![self.calculate_child_levels(
array_offsets,
@@ -145,7 +146,6 @@ impl LevelInfo {
field.is_nullable(),
)]
}
- DataType::FixedSizeBinary(_) => unimplemented!(),
DataType::List(list_field) | DataType::LargeList(list_field) => {
// Calculate the list level
let list_level = self.calculate_child_levels(
@@ -189,7 +189,8 @@ impl LevelInfo {
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Dictionary(_, _)
- | DataType::Decimal(_, _) => {
+ | DataType::Decimal(_, _)
+ | DataType::FixedSizeBinary(_) => {
vec![list_level.calculate_child_levels(
child_offsets,
child_mask,
@@ -197,7 +198,6 @@ impl LevelInfo {
list_field.is_nullable(),
)]
}
- DataType::FixedSizeBinary(_) => unimplemented!(),
DataType::List(_) | DataType::LargeList(_) |
DataType::Struct(_) => {
list_level.calculate_array_levels(&child_array,
list_field)
}
@@ -297,9 +297,10 @@ impl LevelInfo {
is_list: bool,
is_nullable: bool,
) -> Self {
- let mut definition = vec![];
- let mut repetition = vec![];
- let mut merged_array_mask = vec![];
+ let min_len = *(array_offsets.last().unwrap()) as usize;
+ let mut definition = Vec::with_capacity(min_len);
+ let mut repetition = Vec::with_capacity(min_len);
+ let mut merged_array_mask = Vec::with_capacity(min_len);
// determine the total level increment based on data types
let max_definition = match is_list {
@@ -624,9 +625,18 @@ impl LevelInfo {
let masks = offsets.windows(2).map(|w| w[1] > w[0]).collect();
(offsets, masks)
}
- DataType::FixedSizeBinary(_)
- | DataType::FixedSizeList(_, _)
- | DataType::Union(_) => {
+ DataType::FixedSizeBinary(value_len) => {
+ let array_mask = match array.data().null_buffer() {
+ Some(buf) => get_bool_array_slice(buf, array.offset(),
array.len()),
+ None => vec![true; array.len()],
+ };
+ let value_len = *value_len as i64;
+ (
+ (0..=(array.len() as i64)).map(|v| v *
value_len).collect(),
+ array_mask,
+ )
+ }
+ DataType::FixedSizeList(_, _) | DataType::Union(_) => {
unimplemented!("Getting offsets not yet implemented")
}
}
diff --git a/rust/parquet/src/arrow/mod.rs b/rust/parquet/src/arrow/mod.rs
index 9095259..b1aa39e 100644
--- a/rust/parquet/src/arrow/mod.rs
+++ b/rust/parquet/src/arrow/mod.rs
@@ -53,7 +53,7 @@ pub(in crate::arrow) mod array_reader;
pub mod arrow_reader;
pub mod arrow_writer;
pub(in crate::arrow) mod converter;
-pub mod levels;
+pub(in crate::arrow) mod levels;
pub(in crate::arrow) mod record_reader;
pub mod schema;