This is an automated email from the ASF dual-hosted git repository.
xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 2daa2c9 arrow/schema.rs: refactor tests (#531)
2daa2c9 is described below
commit 2daa2c942c7a8a050d8d8ab6b72a4f699e147522
Author: Shirly <[email protected]>
AuthorDate: Fri Aug 9 16:52:05 2024 +0800
arrow/schema.rs: refactor tests (#531)
* arrow/schema.rs: refactor tests
Signed-off-by: Shirly <[email protected]>
* *:address comments
Signed-off-by: Shirly <[email protected]>
---------
Signed-off-by: Shirly <[email protected]>
---
crates/iceberg/src/arrow/schema.rs | 396 ++++++++++++-------------------------
1 file changed, 125 insertions(+), 271 deletions(-)
diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs
index c927069..a69605e 100644
--- a/crates/iceberg/src/arrow/schema.rs
+++ b/crates/iceberg/src/arrow/schema.rs
@@ -649,178 +649,104 @@ mod tests {
use super::*;
use crate::spec::Schema;
+ /// Create a simple field with metadata.
+ fn simple_field(name: &str, ty: DataType, nullable: bool, value: &str) -> Field {
+ Field::new(name, ty, nullable).with_metadata(HashMap::from([(
+ PARQUET_FIELD_ID_META_KEY.to_string(),
+ value.to_string(),
+ )]))
+ }
+
fn arrow_schema_for_arrow_schema_to_schema_test() -> ArrowSchema {
let fields = Fields::from(vec![
- Field::new("key", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "17".to_string(),
- )])),
- Field::new("value", DataType::Utf8,
true).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "18".to_string(),
- )])),
+ simple_field("key", DataType::Int32, false, "17"),
+ simple_field("value", DataType::Utf8, true, "18"),
]);
let r#struct = DataType::Struct(fields);
let map = DataType::Map(
- Arc::new(
- Field::new(DEFAULT_MAP_FIELD_NAME, r#struct,
false).with_metadata(HashMap::from([
- (PARQUET_FIELD_ID_META_KEY.to_string(), "19".to_string()),
- ])),
- ),
+ Arc::new(simple_field(DEFAULT_MAP_FIELD_NAME, r#struct, false, "17")),
false,
);
let fields = Fields::from(vec![
- Field::new("aa", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "18".to_string(),
- )])),
- Field::new("bb", DataType::Utf8,
true).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "19".to_string(),
- )])),
- Field::new(
+ simple_field("aa", DataType::Int32, false, "18"),
+ simple_field("bb", DataType::Utf8, true, "19"),
+ simple_field(
"cc",
DataType::Timestamp(TimeUnit::Microsecond, None),
false,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "20".to_string(),
- )])),
+ "20",
+ ),
]);
let r#struct = DataType::Struct(fields);
ArrowSchema::new(vec![
- Field::new("a", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "2".to_string(),
- )])),
- Field::new("b", DataType::Int64,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "1".to_string(),
- )])),
- Field::new("c", DataType::Utf8,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "3".to_string(),
- )])),
- Field::new("n", DataType::LargeUtf8,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "21".to_string(),
- )])),
- Field::new("d", DataType::Timestamp(TimeUnit::Microsecond, None),
true).with_metadata(
- HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(),
"4".to_string())]),
- ),
- Field::new("e", DataType::Boolean,
true).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "6".to_string(),
- )])),
- Field::new("f", DataType::Float32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "5".to_string(),
- )])),
- Field::new("g", DataType::Float64,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "7".to_string(),
- )])),
- Field::new("p", DataType::Decimal128(10, 2),
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "27".to_string(),
- )])),
- Field::new("h", DataType::Date32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "8".to_string(),
- )])),
- Field::new("i", DataType::Time64(TimeUnit::Microsecond),
false).with_metadata(
- HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(),
"9".to_string())]),
+ simple_field("a", DataType::Int32, false, "2"),
+ simple_field("b", DataType::Int64, false, "1"),
+ simple_field("c", DataType::Utf8, false, "3"),
+ simple_field("n", DataType::Utf8, false, "21"),
+ simple_field(
+ "d",
+ DataType::Timestamp(TimeUnit::Microsecond, None),
+ true,
+ "4",
),
- Field::new(
+ simple_field("e", DataType::Boolean, true, "6"),
+ simple_field("f", DataType::Float32, false, "5"),
+ simple_field("g", DataType::Float64, false, "7"),
+ simple_field("p", DataType::Decimal128(10, 2), false, "27"),
+ simple_field("h", DataType::Date32, false, "8"),
+ simple_field("i", DataType::Time64(TimeUnit::Microsecond), false, "9"),
+ simple_field(
"j",
DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
false,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "10".to_string(),
- )])),
- Field::new(
+ "10",
+ ),
+ simple_field(
"k",
DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
false,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "12".to_string(),
- )])),
- Field::new("l", DataType::Binary,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "13".to_string(),
- )])),
- Field::new("o", DataType::LargeBinary,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "22".to_string(),
- )])),
- Field::new("m", DataType::FixedSizeBinary(10),
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "11".to_string(),
- )])),
- Field::new(
+ "12",
+ ),
+ simple_field("l", DataType::Binary, false, "13"),
+ simple_field("o", DataType::LargeBinary, false, "22"),
+ simple_field("m", DataType::FixedSizeBinary(10), false, "11"),
+ simple_field(
"list",
- DataType::List(Arc::new(
- Field::new("element", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "15".to_string(),
- )])),
- )),
+ DataType::List(Arc::new(simple_field(
+ "element",
+ DataType::Int32,
+ false,
+ "15",
+ ))),
true,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "14".to_string(),
- )])),
- Field::new(
+ "14",
+ ),
+ simple_field(
"large_list",
- DataType::LargeList(Arc::new(
- Field::new("element", DataType::Utf8,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "23".to_string(),
- )])),
- )),
+ DataType::LargeList(Arc::new(simple_field(
+ "element",
+ DataType::Utf8,
+ false,
+ "23",
+ ))),
true,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "24".to_string(),
- )])),
- Field::new(
+ "24",
+ ),
+ simple_field(
"fixed_list",
DataType::FixedSizeList(
- Arc::new(
- Field::new("element", DataType::Binary,
false).with_metadata(
- HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "26".to_string(),
- )]),
- ),
- ),
+ Arc::new(simple_field("element", DataType::Binary, false, "26")),
10,
),
true,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "25".to_string(),
- )])),
- Field::new("map", map, false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "16".to_string(),
- )])),
- Field::new("struct", r#struct,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "17".to_string(),
- )])),
+ "25",
+ ),
+ simple_field("map", map, false, "16"),
+ simple_field("struct", r#struct, false, "17"),
])
}
@@ -1017,14 +943,8 @@ mod tests {
fn arrow_schema_for_schema_to_arrow_schema_test() -> ArrowSchema {
let fields = Fields::from(vec![
- Field::new("key", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "17".to_string(),
- )])),
- Field::new("value", DataType::Utf8,
true).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "18".to_string(),
- )])),
+ simple_field("key", DataType::Int32, false, "17"),
+ simple_field("value", DataType::Utf8, true, "18"),
]);
let r#struct = DataType::Struct(fields);
@@ -1034,152 +954,86 @@ mod tests {
);
let fields = Fields::from(vec![
- Field::new("aa", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "18".to_string(),
- )])),
- Field::new("bb", DataType::Utf8,
true).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "19".to_string(),
- )])),
- Field::new(
+ simple_field("aa", DataType::Int32, false, "18"),
+ simple_field("bb", DataType::Utf8, true, "19"),
+ simple_field(
"cc",
DataType::Timestamp(TimeUnit::Microsecond, None),
false,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "20".to_string(),
- )])),
+ "20",
+ ),
]);
let r#struct = DataType::Struct(fields);
ArrowSchema::new(vec![
- Field::new("a", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "2".to_string(),
- )])),
- Field::new("b", DataType::Int64,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "1".to_string(),
- )])),
- Field::new("c", DataType::Utf8,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "3".to_string(),
- )])),
- Field::new("n", DataType::Utf8,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "21".to_string(),
- )])),
- Field::new("d", DataType::Timestamp(TimeUnit::Microsecond, None),
true).with_metadata(
- HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(),
"4".to_string())]),
- ),
- Field::new("e", DataType::Boolean,
true).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "6".to_string(),
- )])),
- Field::new("f", DataType::Float32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "5".to_string(),
- )])),
- Field::new("g", DataType::Float64,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "7".to_string(),
- )])),
- Field::new("p", DataType::Decimal128(10, 2),
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "27".to_string(),
- )])),
- Field::new("h", DataType::Date32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "8".to_string(),
- )])),
- Field::new("i", DataType::Time64(TimeUnit::Microsecond),
false).with_metadata(
- HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(),
"9".to_string())]),
+ simple_field("a", DataType::Int32, false, "2"),
+ simple_field("b", DataType::Int64, false, "1"),
+ simple_field("c", DataType::Utf8, false, "3"),
+ simple_field("n", DataType::Utf8, false, "21"),
+ simple_field(
+ "d",
+ DataType::Timestamp(TimeUnit::Microsecond, None),
+ true,
+ "4",
),
- Field::new(
+ simple_field("e", DataType::Boolean, true, "6"),
+ simple_field("f", DataType::Float32, false, "5"),
+ simple_field("g", DataType::Float64, false, "7"),
+ simple_field("p", DataType::Decimal128(10, 2), false, "27"),
+ simple_field("h", DataType::Date32, false, "8"),
+ simple_field("i", DataType::Time64(TimeUnit::Microsecond), false, "9"),
+ simple_field(
"j",
DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
false,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "10".to_string(),
- )])),
- Field::new(
+ "10",
+ ),
+ simple_field(
"k",
DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
false,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "12".to_string(),
- )])),
- Field::new("l", DataType::LargeBinary,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "13".to_string(),
- )])),
- Field::new("o", DataType::LargeBinary,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "22".to_string(),
- )])),
- Field::new("m", DataType::FixedSizeBinary(10),
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "11".to_string(),
- )])),
- Field::new(
+ "12",
+ ),
+ simple_field("l", DataType::LargeBinary, false, "13"),
+ simple_field("o", DataType::LargeBinary, false, "22"),
+ simple_field("m", DataType::FixedSizeBinary(10), false, "11"),
+ simple_field(
"list",
- DataType::List(Arc::new(
- Field::new("element", DataType::Int32,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "15".to_string(),
- )])),
- )),
+ DataType::List(Arc::new(simple_field(
+ "element",
+ DataType::Int32,
+ false,
+ "15",
+ ))),
true,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "14".to_string(),
- )])),
- Field::new(
+ "14",
+ ),
+ simple_field(
"large_list",
- DataType::List(Arc::new(
- Field::new("element", DataType::Utf8,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "23".to_string(),
- )])),
- )),
+ DataType::List(Arc::new(simple_field(
+ "element",
+ DataType::Utf8,
+ false,
+ "23",
+ ))),
true,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "24".to_string(),
- )])),
- Field::new(
+ "24",
+ ),
+ simple_field(
"fixed_list",
- DataType::List(Arc::new(
- Field::new("element", DataType::LargeBinary,
false).with_metadata(
- HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(),
"26".to_string())]),
- ),
- )),
+ DataType::List(Arc::new(simple_field(
+ "element",
+ DataType::LargeBinary,
+ false,
+ "26",
+ ))),
true,
- )
- .with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "25".to_string(),
- )])),
- Field::new("map", map, false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "16".to_string(),
- )])),
- Field::new("struct", r#struct,
false).with_metadata(HashMap::from([(
- PARQUET_FIELD_ID_META_KEY.to_string(),
- "17".to_string(),
- )])),
- Field::new("uuid", DataType::FixedSizeBinary(16),
false).with_metadata(HashMap::from(
- [(PARQUET_FIELD_ID_META_KEY.to_string(), "26".to_string())],
- )),
+ "25",
+ ),
+ simple_field("map", map, false, "16"),
+ simple_field("struct", r#struct, false, "17"),
+ simple_field("uuid", DataType::FixedSizeBinary(16), false, "26"),
])
}