From Hussain Towaileb <[email protected]>:
Hussain Towaileb has uploaded this change for review. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21323?usp=email )
Change subject: [NO ISSUE][EXT]: map the iceberg MAP type to array of key-value
objects
......................................................................
[NO ISSUE][EXT]: map the iceberg MAP type to array of key-value objects
Ext-ref: MB-72280
Change-Id: I562fcb2c4c62770a67d2adb7c1f8dcaa1fca5d59
---
M
asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm
M
asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java
6 files changed, 23 insertions(+), 20 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb
refs/changes/23/21323/1
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm
index d5f405e..cb3d36d 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm
@@ -1 +1 @@
-{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": 19723,
"time_field": 37230000, "timestamp_field": 1707000000000000,
"timestamp_ntz_field": 1707048000000000, "timestamp_nano_field":
1707000000000000000, "interval_ym_field": 14, "interval_dt_field": 37230000000,
"struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [
"a", "b", "c" ], "map_field": { "key1": "value1", "key2": "100" },
"variant_field": "string value", "unknown_field": null }
\ No newline at end of file
+{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": 19723,
"time_field": 37230000, "timestamp_field": 1707000000000000,
"timestamp_ntz_field": 1707048000000000, "timestamp_nano_field":
1707000000000000000, "interval_ym_field": 14, "interval_dt_field": 37230000000,
"struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [
"a", "b", "c" ], "map_field": [ { "key": "key1", "value": "value1" }, { "key":
"key2", "value": "100" } ], "variant_field": "string value", "unknown_field":
null }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm
index 4dfcb3a..618e682 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm
@@ -1 +1 @@
-{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1":
"value1", "key2": "100" }, "variant_field": "string value", "unknown_field":
null }
\ No newline at end of file
+{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key":
"key1", "value": "value1" }, { "key": "key2", "value": "100" } ],
"variant_field": "string value", "unknown_field": null }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm
index 4dfcb3a..618e682 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm
@@ -1 +1 @@
-{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1":
"value1", "key2": "100" }, "variant_field": "string value", "unknown_field":
null }
\ No newline at end of file
+{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key":
"key1", "value": "value1" }, { "key": "key2", "value": "100" } ],
"variant_field": "string value", "unknown_field": null }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm
index a6c1db5..5d49e61 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm
@@ -1 +1 @@
-{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1":
"value1", "key2": "100" }, "variant_field": "string value", "unknown_field":
null }
\ No newline at end of file
+{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key":
"key1", "value": "value1" }, { "key": "key2", "value": "100" } ],
"variant_field": "string value", "unknown_field": null }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm
index a6c1db5..5d49e61 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm
@@ -1 +1 @@
-{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1":
"value1", "key2": "100" }, "variant_field": "string value", "unknown_field":
null }
\ No newline at end of file
+{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=",
"geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=",
"bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42,
"long_field": 9223372036854775807, "float_field": 3.14, "double_field":
2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world",
"varchar_field": "varchar value one", "char_field": "Hi", "uuid_field":
uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"),
"time_field": time("10:20:30.000"), "timestamp_field":
datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field":
datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field":
datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14,
"interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30,
"active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key":
"key1", "value": "value1" }, { "key": "key2", "value": "100" } ],
"variant_field": "string value", "unknown_field": null }
\ No newline at end of file
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java
index aaf37d0..7ed1b24 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java
@@ -232,28 +232,31 @@
}
private void parseMap(Types.MapType mapSchema, Map<?, ?> map, DataOutput
out) throws IOException {
- IMutableValueStorage valueBuffer = parserContext.enterObject();
+ final IMutableValueStorage item = parserContext.enterCollection();
+ final IMutableValueStorage valueBuffer = parserContext.enterObject();
IARecordBuilder objectBuilder =
parserContext.getObjectBuilder(DefaultOpenFieldType.NESTED_OPEN_RECORD_TYPE);
- valueEmbedder.enterObject();
+ IAsterixListBuilder listBuilder =
+
parserContext.getCollectionBuilder(DefaultOpenFieldType.NESTED_OPEN_AORDERED_LIST_TYPE);
Type keyType = mapSchema.keyType();
Type valueType = mapSchema.valueType();
- // TODO: we can't support non-string keys since we map MAP-TYPE to
OBJECT-TYPE in AsterixDB
- if (keyType != Types.StringType.get()) {
- throw new RuntimeDataException(ErrorCode.TYPE_UNSUPPORTED,
"Iceberg Parser", "MAP with non-string keys");
- }
-
for (Map.Entry<?, ?> entry : map.entrySet()) {
- String fieldName = (String) entry.getKey();
- Object fieldValue = entry.getValue();
- parseValueAndAddObjectField(valueBuffer, objectBuilder, valueType,
fieldName, fieldValue);
+ objectBuilder.reset(DefaultOpenFieldType.NESTED_OPEN_RECORD_TYPE);
+ valueBuffer.reset();
+ parseValue(keyType, entry.getKey(), valueBuffer.getDataOutput());
+
objectBuilder.addField(parserContext.getSerializedFieldName("key"),
valueBuffer);
+ valueBuffer.reset();
+ parseValue(valueType, entry.getValue(),
valueBuffer.getDataOutput());
+
objectBuilder.addField(parserContext.getSerializedFieldName("value"),
valueBuffer);
+ item.reset();
+ objectBuilder.write(item.getDataOutput(), true);
+ listBuilder.addItem(item);
}
- embedMissingValues(objectBuilder, parserContext, valueEmbedder);
- objectBuilder.write(out, true);
- valueEmbedder.exitObject();
+ listBuilder.write(out, true);
parserContext.exitObject(valueBuffer, null, objectBuilder);
+ parserContext.exitCollection(item, listBuilder);
}
private void parseValueAndAddObjectField(IMutableValueStorage valueBuffer,
IARecordBuilder objectBuilder,
@@ -467,8 +470,8 @@
ensureDecimalToDoubleEnabled(type, parserContext);
yield ATypeTag.DOUBLE;
}
- case STRUCT, MAP -> ATypeTag.OBJECT;
- case LIST -> ATypeTag.ARRAY;
+ case STRUCT -> ATypeTag.OBJECT;
+ case LIST, MAP -> ATypeTag.ARRAY;
case DATE -> {
if (parserContext.isDateAsInt()) {
yield ATypeTag.INTEGER;
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21323?usp=email
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings?usp=email
Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: lumina
Gerrit-Change-Id: I562fcb2c4c62770a67d2adb7c1f8dcaa1fca5d59
Gerrit-Change-Number: 21323
Gerrit-PatchSet: 1
Gerrit-Owner: Hussain Towaileb <[email protected]>