This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch rust-parquet-arrow-writer
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/rust-parquet-arrow-writer by this push:
     new e456dfc  ARROW-10191: [Rust] [Parquet] Add roundtrip Arrow -> Parquet tests for all supported Arrow DataTypes

e456dfc is described below

commit e456dfc6f2d4519a1bf2ccca9531a75065216c2f
Author: Carol (Nichols || Goulding) <carol.nich...@gmail.com>
AuthorDate: Tue Oct 6 08:44:26 2020 -0600

    ARROW-10191: [Rust] [Parquet] Add roundtrip Arrow -> Parquet tests for all supported Arrow DataTypes

    Note that this PR goes to the rust-parquet-arrow-writer branch, not master.

    Inspired by the tests in cpp/src/parquet/arrow/arrow_reader_writer_test.cc, these
    tests round-trip a single RecordBatch (Arrow -> Parquet -> Arrow) with a single
    column of values for each of the supported data types, plus some of the
    unsupported ones.

    Tests that currently fail are marked with either `#[should_panic]` (when they
    fail with a panic) or `#[ignore]` (when they fail because values don't match).

    I am directly comparing each RecordBatch column's data before and after the
    round trip. I'm not sure whether that is appropriate, because for some data
    types the `null_bitmap` doesn't match, and I don't know whether it should. I
    would love advice on that front, and I would love to know whether these tests
    are useful!

    Closes #8330 from carols10cents/roundtrip-tests

    Lead-authored-by: Carol (Nichols || Goulding) <carol.nich...@gmail.com>
    Co-authored-by: Neville Dipale <nevilled...@gmail.com>
    Signed-off-by: Andy Grove <andygr...@nvidia.com>
---
 rust/arrow/src/compute/kernels/cast.rs |  20 +-
 rust/parquet/src/arrow/array_reader.rs | 102 +++++---
 rust/parquet/src/arrow/arrow_writer.rs | 413 ++++++++++++++++++++++++++++++++-
 rust/parquet/src/arrow/converter.rs    |  25 +-
 4 files changed, 523 insertions(+), 37 deletions(-)

diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index d8cb480..a1142b4 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -327,11 +327,27 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
 
         // temporal casts
         (Int32, Date32(_)) => cast_array_data::<Date32Type>(array, to_type.clone()),
-        (Int32, Time32(_)) => cast_array_data::<Date32Type>(array, to_type.clone()),
+        (Int32, Time32(unit)) => match unit {
+            TimeUnit::Second => {
+                cast_array_data::<Time32SecondType>(array, to_type.clone())
+            }
+            TimeUnit::Millisecond => {
+                cast_array_data::<Time32MillisecondType>(array, to_type.clone())
+            }
+            _ => unreachable!(),
+        },
         (Date32(_), Int32) => cast_array_data::<Int32Type>(array, to_type.clone()),
         (Time32(_), Int32) => cast_array_data::<Int32Type>(array, to_type.clone()),
         (Int64, Date64(_)) => cast_array_data::<Date64Type>(array, to_type.clone()),
-        (Int64, Time64(_)) => cast_array_data::<Date64Type>(array, to_type.clone()),
+        (Int64, Time64(unit)) => match unit {
+            TimeUnit::Microsecond => {
+                cast_array_data::<Time64MicrosecondType>(array, to_type.clone())
+            }
+            TimeUnit::Nanosecond => {
+                cast_array_data::<Time64NanosecondType>(array, to_type.clone())
+            }
+            _ => unreachable!(),
+        },
         (Date64(_), Int64) => cast_array_data::<Int64Type>(array, to_type.clone()),
         (Time64(_), Int64) => cast_array_data::<Int64Type>(array, to_type.clone()),
         (Date32(DateUnit::Day), Date64(DateUnit::Millisecond)) => {
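For illustration, here is a minimal, self-contained sketch of the unit-aware
dispatch this hunk enables, using only the public `cast` API shown in the hunk
header above (the sample values are made up):

    use std::sync::Arc;
    use arrow::array::{Array, ArrayRef, Int32Array, Time32MillisecondArray};
    use arrow::compute::cast;
    use arrow::datatypes::{DataType, TimeUnit};

    fn main() {
        let ints: ArrayRef = Arc::new(Int32Array::from(vec![1_000, 2_000, 3_000]));
        // Second and Millisecond are the only units representable in Time32,
        // which is why the remaining match arms above are `unreachable!()`.
        let times = cast(&ints, &DataType::Time32(TimeUnit::Millisecond))
            .expect("Int32 -> Time32 should be castable after this change");
        let times = times
            .as_any()
            .downcast_ref::<Time32MillisecondArray>()
            .expect("cast should produce a Time32MillisecondArray");
        assert_eq!(times.value(0), 1_000);
    }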
diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs
index 14bf7d2..4fbc54d 100644
--- a/rust/parquet/src/arrow/array_reader.rs
+++ b/rust/parquet/src/arrow/array_reader.rs
@@ -35,9 +35,10 @@ use crate::arrow::converter::{
     BinaryArrayConverter, BinaryConverter, BoolConverter, BooleanArrayConverter,
     Converter, Date32Converter, FixedLenBinaryConverter, FixedSizeArrayConverter,
     Float32Converter, Float64Converter, Int16Converter, Int32Converter, Int64Converter,
-    Int8Converter, Int96ArrayConverter, Int96Converter, TimestampMicrosecondConverter,
-    TimestampMillisecondConverter, UInt16Converter, UInt32Converter, UInt64Converter,
-    UInt8Converter, Utf8ArrayConverter, Utf8Converter,
+    Int8Converter, Int96ArrayConverter, Int96Converter, Time32MillisecondConverter,
+    Time32SecondConverter, Time64MicrosecondConverter, Time64NanosecondConverter,
+    TimestampMicrosecondConverter, TimestampMillisecondConverter, UInt16Converter,
+    UInt32Converter, UInt64Converter, UInt8Converter, Utf8ArrayConverter, Utf8Converter,
 };
 use crate::arrow::record_reader::RecordReader;
 use crate::arrow::schema::parquet_to_arrow_field;
@@ -196,11 +197,27 @@ impl<T: DataType> ArrayReader for PrimitiveArrayReader<T> {
                     .convert(self.record_reader.cast::<Int32Type>()),
                 _ => Err(general_err!("No conversion from parquet type to arrow type for date with unit {:?}", unit)),
             }
-            (ArrowType::Time32(_), PhysicalType::INT32) => {
-                UInt32Converter::new().convert(self.record_reader.cast::<Int32Type>())
+            (ArrowType::Time32(unit), PhysicalType::INT32) => {
+                match unit {
+                    TimeUnit::Second => {
+                        Time32SecondConverter::new().convert(self.record_reader.cast::<Int32Type>())
+                    }
+                    TimeUnit::Millisecond => {
+                        Time32MillisecondConverter::new().convert(self.record_reader.cast::<Int32Type>())
+                    }
+                    _ => Err(general_err!("Invalid or unsupported arrow array with datatype {:?}", self.get_data_type()))
+                }
             }
-            (ArrowType::Time64(_), PhysicalType::INT64) => {
-                UInt64Converter::new().convert(self.record_reader.cast::<Int64Type>())
+            (ArrowType::Time64(unit), PhysicalType::INT64) => {
+                match unit {
+                    TimeUnit::Microsecond => {
+                        Time64MicrosecondConverter::new().convert(self.record_reader.cast::<Int64Type>())
+                    }
+                    TimeUnit::Nanosecond => {
+                        Time64NanosecondConverter::new().convert(self.record_reader.cast::<Int64Type>())
+                    }
+                    _ => Err(general_err!("Invalid or unsupported arrow array with datatype {:?}", self.get_data_type()))
+                }
             }
             (ArrowType::Interval(IntervalUnit::YearMonth), PhysicalType::INT32) => {
                 UInt32Converter::new().convert(self.record_reader.cast::<Int32Type>())
@@ -941,10 +958,12 @@ mod tests {
     use crate::util::test_common::{get_test_file, make_pages};
     use arrow::array::{Array, ArrayRef, PrimitiveArray, StringArray, StructArray};
     use arrow::datatypes::{
-        DataType as ArrowType, Date32Type as ArrowDate32, Field, Int32Type as ArrowInt32,
+        ArrowPrimitiveType, DataType as ArrowType, Date32Type as ArrowDate32, Field,
+        Int32Type as ArrowInt32, Int64Type as ArrowInt64,
+        Time32MillisecondType as ArrowTime32MillisecondArray,
+        Time64MicrosecondType as ArrowTime64MicrosecondArray,
         TimestampMicrosecondType as ArrowTimestampMicrosecondType,
         TimestampMillisecondType as ArrowTimestampMillisecondType,
-        UInt32Type as ArrowUInt32, UInt64Type as ArrowUInt64,
     };
     use rand::distributions::uniform::SampleUniform;
     use rand::{thread_rng, Rng};
@@ -1101,7 +1120,7 @@ mod tests {
     }
 
     macro_rules! test_primitive_array_reader_one_type {
-        ($arrow_parquet_type:ty, $physical_type:expr, $logical_type_str:expr, $result_arrow_type:ty, $result_primitive_type:ty) => {{
+        ($arrow_parquet_type:ty, $physical_type:expr, $logical_type_str:expr, $result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty) => {{
             let message_type = format!(
                 "
             message test_schema {{
@@ -1112,7 +1131,7 @@ mod tests {
             );
             let schema = parse_message_type(&message_type)
                 .map(|t| Rc::new(SchemaDescriptor::new(Rc::new(t))))
-                .unwrap();
+                .expect("Unable to parse message type into a schema descriptor");
 
             let column_desc = schema.column(0);
 
@@ -1142,24 +1161,48 @@ mod tests {
                 Box::new(page_iterator),
                 column_desc.clone(),
             )
-            .unwrap();
+            .expect("Unable to get array reader");
 
-            let array = array_reader.next_batch(50).unwrap();
+            let array = array_reader
+                .next_batch(50)
+                .expect("Unable to get batch from reader");
 
+            let result_data_type = <$result_arrow_type>::get_data_type();
             let array = array
                 .as_any()
                 .downcast_ref::<PrimitiveArray<$result_arrow_type>>()
-                .unwrap();
-
-            assert_eq!(
-                &PrimitiveArray::<$result_arrow_type>::from(
-                    data[0..50]
-                        .iter()
-                        .map(|x| *x as $result_primitive_type)
-                        .collect::<Vec<$result_primitive_type>>()
-                ),
-                array
+                .expect(
+                    format!(
+                        "Unable to downcast {:?} to {:?}",
+                        array.data_type(),
+                        result_data_type
+                    )
+                    .as_str(),
+                );
+
+            // create expected array as primitive, and cast to result type
+            let expected = PrimitiveArray::<$result_arrow_cast_type>::from(
+                data[0..50]
+                    .iter()
+                    .map(|x| *x as $result_primitive_type)
+                    .collect::<Vec<$result_primitive_type>>(),
             );
+            let expected = Arc::new(expected) as ArrayRef;
+            let expected = arrow::compute::cast(&expected, &result_data_type)
+                .expect("Unable to cast expected array");
+            assert_eq!(expected.data_type(), &result_data_type);
+            let expected = expected
+                .as_any()
+                .downcast_ref::<PrimitiveArray<$result_arrow_type>>()
+                .expect(
+                    format!(
+                        "Unable to downcast expected {:?} to {:?}",
+                        expected.data_type(),
+                        result_data_type
+                    )
+                    .as_str(),
+                );
+            assert_eq!(expected, array);
         }
     }};
 }
@@ -1171,27 +1214,31 @@
             PhysicalType::INT32,
             "DATE",
             ArrowDate32,
+            ArrowInt32,
             i32
         );
         test_primitive_array_reader_one_type!(
             Int32Type,
             PhysicalType::INT32,
             "TIME_MILLIS",
-            ArrowUInt32,
-            u32
+            ArrowTime32MillisecondArray,
+            ArrowInt32,
+            i32
         );
         test_primitive_array_reader_one_type!(
             Int64Type,
             PhysicalType::INT64,
             "TIME_MICROS",
-            ArrowUInt64,
-            u64
+            ArrowTime64MicrosecondArray,
+            ArrowInt64,
+            i64
        );
         test_primitive_array_reader_one_type!(
             Int64Type,
             PhysicalType::INT64,
             "TIMESTAMP_MILLIS",
             ArrowTimestampMillisecondType,
+            ArrowInt64,
             i64
         );
         test_primitive_array_reader_one_type!(
@@ -1199,6 +1246,7 @@
             PhysicalType::INT64,
             "TIMESTAMP_MICROS",
             ArrowTimestampMicrosecondType,
+            ArrowInt64,
             i64
         );
     }
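The macro's new build-then-cast trick for the expected array can be sketched on
its own. The same pattern outside the macro, for the TIME_MICROS case (values
made up), relying on the Int64 -> Time64 cast added in cast.rs above:

    use std::sync::Arc;
    use arrow::array::{Array, ArrayRef, Int64Array, Time64MicrosecondArray};
    use arrow::compute::cast;
    use arrow::datatypes::{DataType, TimeUnit};

    fn main() {
        // Build the expected values as the plain integer type first...
        let expected: ArrayRef = Arc::new(Int64Array::from(vec![0i64, 1, 2]));
        // ...then cast to the logical type the reader is expected to produce,
        // exactly as the `$result_arrow_cast_type` parameter does above.
        let expected = cast(&expected, &DataType::Time64(TimeUnit::Microsecond))
            .expect("Unable to cast expected array");
        let expected = expected
            .as_any()
            .downcast_ref::<Time64MicrosecondArray>()
            .expect("Unable to downcast expected array");
        assert_eq!(expected.value(2), 2);
    }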
diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs
index e0ad207..cf7b9a2 100644
--- a/rust/parquet/src/arrow/arrow_writer.rs
+++ b/rust/parquet/src/arrow/arrow_writer.rs
@@ -136,7 +136,6 @@ fn write_leaves(
         | ArrowDataType::UInt16
         | ArrowDataType::UInt32
         | ArrowDataType::UInt64
-        | ArrowDataType::Float16
         | ArrowDataType::Float32
         | ArrowDataType::Float64
         | ArrowDataType::Timestamp(_, _)
@@ -176,6 +175,9 @@ fn write_leaves(
             }
             Ok(())
         }
+        ArrowDataType::Float16 => Err(ParquetError::ArrowError(
+            "Float16 arrays not supported".to_string(),
+        )),
         ArrowDataType::FixedSizeList(_, _)
         | ArrowDataType::Null
         | ArrowDataType::Boolean
@@ -493,7 +495,7 @@ mod tests {
     use arrow::array::*;
     use arrow::datatypes::ToByteSlice;
     use arrow::datatypes::{DataType, Field, Schema};
-    use arrow::record_batch::{RecordBatch, RecordBatchReader};
+    use arrow::record_batch::RecordBatch;
 
     use crate::arrow::{ArrowReader, ParquetFileArrowReader};
     use crate::file::{metadata::KeyValue, reader::SerializedFileReader};
@@ -597,7 +599,7 @@ mod tests {
         let mut arrow_reader = ParquetFileArrowReader::new(Rc::new(file_reader));
         let mut record_batch_reader = arrow_reader.get_record_reader(1024).unwrap();
 
-        let batch = record_batch_reader.next_batch().unwrap().unwrap();
+        let batch = record_batch_reader.next().unwrap().unwrap();
         let string_col = batch
             .column(0)
             .as_any()
@@ -688,4 +690,409 @@ mod tests {
         writer.write(&batch).unwrap();
         writer.close().unwrap();
     }
+
+    const SMALL_SIZE: usize = 100;
+
+    fn roundtrip(filename: &str, expected_batch: RecordBatch) {
+        let file = get_temp_file(filename, &[]);
+
+        let mut writer = ArrowWriter::try_new(
+            file.try_clone().unwrap(),
+            expected_batch.schema(),
+            None,
+        )
+        .unwrap();
+        writer.write(&expected_batch).unwrap();
+        writer.close().unwrap();
+
+        let reader = SerializedFileReader::new(file).unwrap();
+        let mut arrow_reader = ParquetFileArrowReader::new(Rc::new(reader));
+        let mut record_batch_reader = arrow_reader.get_record_reader(1024).unwrap();
+
+        let actual_batch = record_batch_reader.next().unwrap().unwrap();
+
+        assert_eq!(expected_batch.schema(), actual_batch.schema());
+        assert_eq!(expected_batch.num_columns(), actual_batch.num_columns());
+        assert_eq!(expected_batch.num_rows(), actual_batch.num_rows());
+        for i in 0..expected_batch.num_columns() {
+            let expected_data = expected_batch.column(i).data();
+            let actual_data = actual_batch.column(i).data();
+
+            assert_eq!(expected_data.data_type(), actual_data.data_type());
+            assert_eq!(expected_data.len(), actual_data.len());
+            assert_eq!(expected_data.null_count(), actual_data.null_count());
+            assert_eq!(expected_data.offset(), actual_data.offset());
+            assert_eq!(expected_data.buffers(), actual_data.buffers());
+            assert_eq!(expected_data.child_data(), actual_data.child_data());
+            assert_eq!(expected_data.null_bitmap(), actual_data.null_bitmap());
+        }
+    }
+
+    fn one_column_roundtrip(filename: &str, values: ArrayRef, nullable: bool) {
+        let schema = Schema::new(vec![Field::new(
+            "col",
+            values.data_type().clone(),
+            nullable,
+        )]);
+        let expected_batch =
+            RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
+
+        roundtrip(filename, expected_batch);
+    }
+
+    fn values_required<A, I>(iter: I, filename: &str)
+    where
+        A: From<Vec<I::Item>> + Array + 'static,
+        I: IntoIterator,
+    {
+        let raw_values: Vec<_> = iter.into_iter().collect();
+        let values = Arc::new(A::from(raw_values));
+        one_column_roundtrip(filename, values, false);
+    }
+
+    fn values_optional<A, I>(iter: I, filename: &str)
+    where
+        A: From<Vec<Option<I::Item>>> + Array + 'static,
+        I: IntoIterator,
+    {
+        let optional_raw_values: Vec<_> = iter
+            .into_iter()
+            .enumerate()
+            .map(|(i, v)| if i % 2 == 0 { None } else { Some(v) })
+            .collect();
+        let optional_values = Arc::new(A::from(optional_raw_values));
+        one_column_roundtrip(filename, optional_values, true);
+    }
+
+    fn required_and_optional<A, I>(iter: I, filename: &str)
+    where
+        A: From<Vec<I::Item>> + From<Vec<Option<I::Item>>> + Array + 'static,
+        I: IntoIterator + Clone,
+    {
+        values_required::<A, I>(iter.clone(), filename);
+        values_optional::<A, I>(iter, filename);
+    }
+
+    #[test]
+    #[should_panic(expected = "Null arrays not supported")]
+    fn null_single_column() {
+        let values = Arc::new(NullArray::new(SMALL_SIZE));
+        one_column_roundtrip("null_single_column", values.clone(), true);
+        one_column_roundtrip("null_single_column", values, false);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Attempting to write an Arrow type that is not yet implemented"
+    )]
+    fn bool_single_column() {
+        required_and_optional::<BooleanArray, _>(
+            [true, false].iter().cycle().copied().take(SMALL_SIZE),
+            "bool_single_column",
+        );
+    }
+
+    #[test]
+    fn i8_single_column() {
+        required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8, "i8_single_column");
+    }
+
+    #[test]
+    fn i16_single_column() {
+        required_and_optional::<Int16Array, _>(0..SMALL_SIZE as i16, "i16_single_column");
+    }
+
+    #[test]
+    fn i32_single_column() {
+        required_and_optional::<Int32Array, _>(0..SMALL_SIZE as i32, "i32_single_column");
+    }
+
+    #[test]
+    fn i64_single_column() {
+        required_and_optional::<Int64Array, _>(0..SMALL_SIZE as i64, "i64_single_column");
+    }
+
+    #[test]
+    fn u8_single_column() {
+        required_and_optional::<UInt8Array, _>(0..SMALL_SIZE as u8, "u8_single_column");
+    }
+
+    #[test]
+    fn u16_single_column() {
+        required_and_optional::<UInt16Array, _>(
+            0..SMALL_SIZE as u16,
+            "u16_single_column",
+        );
+    }
+
+    #[test]
+    fn u32_single_column() {
+        required_and_optional::<UInt32Array, _>(
+            0..SMALL_SIZE as u32,
+            "u32_single_column",
+        );
+    }
+
+    #[test]
+    fn u64_single_column() {
+        required_and_optional::<UInt64Array, _>(
+            0..SMALL_SIZE as u64,
+            "u64_single_column",
+        );
+    }
+
+    #[test]
+    fn f32_single_column() {
+        required_and_optional::<Float32Array, _>(
+            (0..SMALL_SIZE).map(|i| i as f32),
+            "f32_single_column",
+        );
+    }
+
+    #[test]
+    fn f64_single_column() {
+        required_and_optional::<Float64Array, _>(
+            (0..SMALL_SIZE).map(|i| i as f64),
+            "f64_single_column",
+        );
+    }
+
+    // The timestamp array types don't implement From<Vec<T>> because they need the
+    // timezone argument, and they also don't support building from a Vec<Option<T>>,
+    // so call one_column_roundtrip manually instead of calling required_and_optional
+    // for these tests.
+
+    #[test]
+    #[ignore] // Timestamp support isn't correct yet
+    fn timestamp_second_single_column() {
+        let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+        let values = Arc::new(TimestampSecondArray::from_vec(raw_values, None));
+
+        one_column_roundtrip("timestamp_second_single_column", values, false);
+    }
+
+    #[test]
+    fn timestamp_millisecond_single_column() {
+        let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+        let values = Arc::new(TimestampMillisecondArray::from_vec(raw_values, None));
+
+        one_column_roundtrip("timestamp_millisecond_single_column", values, false);
+    }
+
+    #[test]
+    fn timestamp_microsecond_single_column() {
+        let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+        let values = Arc::new(TimestampMicrosecondArray::from_vec(raw_values, None));
+
+        one_column_roundtrip("timestamp_microsecond_single_column", values, false);
+    }
+
+    #[test]
+    #[ignore] // Timestamp support isn't correct yet
+    fn timestamp_nanosecond_single_column() {
+        let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
+        let values = Arc::new(TimestampNanosecondArray::from_vec(raw_values, None));
+
+        one_column_roundtrip("timestamp_nanosecond_single_column", values, false);
+    }
+
+    #[test]
+    fn date32_single_column() {
+        required_and_optional::<Date32Array, _>(
+            0..SMALL_SIZE as i32,
+            "date32_single_column",
+        );
+    }
+
+    #[test]
+    #[ignore] // Date support isn't correct yet
+    fn date64_single_column() {
+        required_and_optional::<Date64Array, _>(
+            0..SMALL_SIZE as i64,
+            "date64_single_column",
+        );
+    }
+
+    #[test]
+    #[ignore] // DateUnit resolution mismatch
+    fn time32_second_single_column() {
+        required_and_optional::<Time32SecondArray, _>(
+            0..SMALL_SIZE as i32,
+            "time32_second_single_column",
+        );
+    }
+
+    #[test]
+    fn time32_millisecond_single_column() {
+        required_and_optional::<Time32MillisecondArray, _>(
+            0..SMALL_SIZE as i32,
+            "time32_millisecond_single_column",
+        );
+    }
+
+    #[test]
+    fn time64_microsecond_single_column() {
+        required_and_optional::<Time64MicrosecondArray, _>(
+            0..SMALL_SIZE as i64,
+            "time64_microsecond_single_column",
+        );
+    }
+
+    #[test]
+    #[ignore] // DateUnit resolution mismatch
+    fn time64_nanosecond_single_column() {
+        required_and_optional::<Time64NanosecondArray, _>(
+            0..SMALL_SIZE as i64,
+            "time64_nanosecond_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Converting Duration to parquet not supported")]
+    fn duration_second_single_column() {
+        required_and_optional::<DurationSecondArray, _>(
+            0..SMALL_SIZE as i64,
+            "duration_second_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Converting Duration to parquet not supported")]
+    fn duration_millisecond_single_column() {
+        required_and_optional::<DurationMillisecondArray, _>(
+            0..SMALL_SIZE as i64,
+            "duration_millisecond_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Converting Duration to parquet not supported")]
+    fn duration_microsecond_single_column() {
+        required_and_optional::<DurationMicrosecondArray, _>(
+            0..SMALL_SIZE as i64,
+            "duration_microsecond_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Converting Duration to parquet not supported")]
+    fn duration_nanosecond_single_column() {
+        required_and_optional::<DurationNanosecondArray, _>(
+            0..SMALL_SIZE as i64,
+            "duration_nanosecond_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Currently unreachable because data type not supported")]
+    fn interval_year_month_single_column() {
+        required_and_optional::<IntervalYearMonthArray, _>(
+            0..SMALL_SIZE as i32,
+            "interval_year_month_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Currently unreachable because data type not supported")]
+    fn interval_day_time_single_column() {
+        required_and_optional::<IntervalDayTimeArray, _>(
+            0..SMALL_SIZE as i64,
+            "interval_day_time_single_column",
+        );
+    }
+
+    #[test]
+    #[ignore] // Binary support isn't correct yet - null_bitmap doesn't match
+    fn binary_single_column() {
+        let one_vec: Vec<u8> = (0..SMALL_SIZE as u8).collect();
+        let many_vecs: Vec<_> = std::iter::repeat(one_vec).take(SMALL_SIZE).collect();
+        let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice());
+
+        // BinaryArrays can't be built from Vec<Option<&[u8]>>, so only call `values_required`
+        values_required::<BinaryArray, _>(many_vecs_iter, "binary_single_column");
+    }
+
+    #[test]
+    #[ignore] // Large Binary support isn't correct yet
+    fn large_binary_single_column() {
+        let one_vec: Vec<u8> = (0..SMALL_SIZE as u8).collect();
+        let many_vecs: Vec<_> = std::iter::repeat(one_vec).take(SMALL_SIZE).collect();
+        let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice());
+
+        // LargeBinaryArrays can't be built from Vec<Option<&[u8]>>, so only call `values_required`
+        values_required::<LargeBinaryArray, _>(
+            many_vecs_iter,
+            "large_binary_single_column",
+        );
+    }
+
+    #[test]
+    #[ignore] // String support isn't correct yet - null_bitmap doesn't match
+    fn string_single_column() {
+        let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i| i.to_string()).collect();
+        let raw_strs = raw_values.iter().map(|s| s.as_str());
+
+        required_and_optional::<StringArray, _>(raw_strs, "string_single_column");
+    }
+
+    #[test]
+    #[ignore] // Large String support isn't correct yet - null_bitmap and buffers don't match
+    fn large_string_single_column() {
+        let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i| i.to_string()).collect();
+        let raw_strs = raw_values.iter().map(|s| s.as_str());
+
+        required_and_optional::<LargeStringArray, _>(
+            raw_strs,
+            "large_string_single_column",
+        );
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Reading parquet list array into arrow is not supported yet!"
+    )]
+    fn list_single_column() {
+        let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+        let a_value_offsets =
+            arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice());
+        let a_list_data = ArrayData::builder(DataType::List(Box::new(DataType::Int32)))
+            .len(5)
+            .add_buffer(a_value_offsets)
+            .add_child_data(a_values.data())
+            .build();
+        let a = ListArray::from(a_list_data);
+
+        let values = Arc::new(a);
+        one_column_roundtrip("list_single_column", values, false);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Reading parquet list array into arrow is not supported yet!"
+    )]
+    fn large_list_single_column() {
+        let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+        let a_value_offsets =
+            arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice());
+        let a_list_data =
+            ArrayData::builder(DataType::LargeList(Box::new(DataType::Int32)))
+                .len(5)
+                .add_buffer(a_value_offsets)
+                .add_child_data(a_values.data())
+                .build();
+        let a = LargeListArray::from(a_list_data);
+
+        let values = Arc::new(a);
+        one_column_roundtrip("large_list_single_column", values, false);
+    }
+
+    #[test]
+    #[ignore] // Struct support isn't correct yet - null_bitmap doesn't match
+    fn struct_single_column() {
+        let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+        let struct_field_a = Field::new("f", DataType::Int32, false);
+        let s =
+            StructArray::from(vec![(struct_field_a, Arc::new(a_values) as ArrayRef)]);
+
+        let values = Arc::new(s);
+        one_column_roundtrip("struct_single_column", values, false);
+    }
 }
diff --git a/rust/parquet/src/arrow/converter.rs b/rust/parquet/src/arrow/converter.rs
index 9fbfa33..c988aae 100644
--- a/rust/parquet/src/arrow/converter.rs
+++ b/rust/parquet/src/arrow/converter.rs
@@ -17,12 +17,19 @@
 use crate::arrow::record_reader::RecordReader;
 use crate::data_type::{ByteArray, DataType, Int96};
-use arrow::array::{
-    Array, ArrayRef, BinaryBuilder, BooleanArray, BooleanBufferBuilder,
-    BufferBuilderTrait, FixedSizeBinaryBuilder, StringBuilder,
-    TimestampNanosecondBuilder,
+// TODO: clean up imports (best done when there are few moving parts)
+use arrow::{
+    array::{
+        Array, ArrayRef, BinaryBuilder, BooleanArray, BooleanBufferBuilder,
+        BufferBuilderTrait, FixedSizeBinaryBuilder, StringBuilder,
+        TimestampNanosecondBuilder,
+    },
+    datatypes::Time32MillisecondType,
+};
+use arrow::{
+    compute::cast, datatypes::Time32SecondType, datatypes::Time64MicrosecondType,
+    datatypes::Time64NanosecondType,
 };
-use arrow::compute::cast;
 use std::convert::From;
 use std::sync::Arc;
@@ -226,6 +233,14 @@
 pub type TimestampMillisecondConverter =
     CastConverter<ParquetInt64Type, TimestampMillisecondType, TimestampMillisecondType>;
 pub type TimestampMicrosecondConverter =
     CastConverter<ParquetInt64Type, TimestampMicrosecondType, TimestampMicrosecondType>;
+pub type Time32SecondConverter =
+    CastConverter<ParquetInt32Type, Time32SecondType, Time32SecondType>;
+pub type Time32MillisecondConverter =
+    CastConverter<ParquetInt32Type, Time32MillisecondType, Time32MillisecondType>;
+pub type Time64MicrosecondConverter =
+    CastConverter<ParquetInt64Type, Time64MicrosecondType, Time64MicrosecondType>;
+pub type Time64NanosecondConverter =
+    CastConverter<ParquetInt64Type, Time64NanosecondType, Time64NanosecondType>;
 pub type UInt64Converter = CastConverter<ParquetInt64Type, UInt64Type, UInt64Type>;
 pub type Float32Converter = CastConverter<ParquetFloatType, Float32Type, Float32Type>;
 pub type Float64Converter = CastConverter<ParquetDoubleType, Float64Type, Float64Type>;
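Putting the pieces together, here is a hedged, self-contained sketch of the
round trip these tests exercise, written against only the public APIs that
appear in this diff (ArrowWriter, SerializedFileReader, ParquetFileArrowReader).
The scratch path and sample values are made up; the real tests use the
get_temp_file helper instead:

    use std::{fs::File, rc::Rc, sync::Arc};

    use arrow::array::{Array, ArrayRef, Time32MillisecondArray};
    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
    use arrow::record_batch::RecordBatch;
    use parquet::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader};
    use parquet::file::reader::SerializedFileReader;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // A nullable Time32(Millisecond) column; From<Vec<Option<i32>>> is the
        // same impl the `values_optional` helper above relies on.
        let schema = Arc::new(Schema::new(vec![Field::new(
            "t",
            DataType::Time32(TimeUnit::Millisecond),
            true,
        )]));
        let values: ArrayRef =
            Arc::new(Time32MillisecondArray::from(vec![Some(0), None, Some(2)]));
        let batch = RecordBatch::try_new(schema.clone(), vec![values])?;

        // Write the batch out with the arrow writer...
        let path = "/tmp/time32_roundtrip.parquet"; // hypothetical scratch file
        let mut writer = ArrowWriter::try_new(File::create(path)?, schema, None)?;
        writer.write(&batch)?;
        writer.close()?;

        // ...and read it back through the arrow reader.
        let reader = SerializedFileReader::new(File::open(path)?)?;
        let mut arrow_reader = ParquetFileArrowReader::new(Rc::new(reader));
        let mut record_batch_reader = arrow_reader.get_record_reader(1024)?;
        let read_back = record_batch_reader.next().unwrap()?;

        let col = read_back
            .column(0)
            .as_any()
            .downcast_ref::<Time32MillisecondArray>()
            .expect("column should still be Time32(Millisecond)");
        assert_eq!(col.null_count(), 1);
        assert_eq!(col.value(2), 2);
        Ok(())
    }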