From Hussain Towaileb <[email protected]>: Hussain Towaileb has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21323?usp=email )
Change subject: [NO ISSUE][EXT]: map the iceberg MAP type to array of key-value objects ...................................................................... [NO ISSUE][EXT]: map the iceberg MAP type to array of key-value objects Ext-ref: MB-72280 Change-Id: I562fcb2c4c62770a67d2adb7c1f8dcaa1fca5d59 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21323 Reviewed-by: Hussain Towaileb <[email protected]> Reviewed-by: Murtadha Hubail <[email protected]> Tested-by: Hussain Towaileb <[email protected]> Integration-Tests: Jenkins <[email protected]> --- M asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm M asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm M asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm M asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm M asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java 6 files changed, 23 insertions(+), 20 deletions(-) Approvals: Murtadha Hubail: Looks good to me, approved Hussain Towaileb: Looks good to me, but someone else must approve; Verified Jenkins: Verified diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm index d5f405e..cb3d36d 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.020.adm @@ -1 +1 @@ -{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, 
"byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": 19723, "time_field": 37230000, "timestamp_field": 1707000000000000, "timestamp_ntz_field": 1707048000000000, "timestamp_nano_field": 1707000000000000000, "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1": "value1", "key2": "100" }, "variant_field": "string value", "unknown_field": null } \ No newline at end of file +{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": 19723, "time_field": 37230000, "timestamp_field": 1707000000000000, "timestamp_ntz_field": 1707048000000000, "timestamp_nano_field": 1707000000000000000, "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key": "key1", "value": "value1" }, { "key": "key2", "value": "100" } ], "variant_field": "string value", "unknown_field": null } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm index 
4dfcb3a..618e682 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.030.adm @@ -1 +1 @@ -{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1": "value1", "key2": "100" }, "variant_field": "string value", "unknown_field": null } \ No newline at end of file +{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": 
datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key": "key1", "value": "value1" }, { "key": "key2", "value": "100" } ], "variant_field": "string value", "unknown_field": null } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm index 4dfcb3a..618e682 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.040.adm @@ -1 +1 @@ -{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1": "value1", "key2": "100" }, "variant_field": "string value", "unknown_field": null } \ No newline at end of file +{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": 
"AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T22:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T22:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key": "key1", "value": "value1" }, { "key": "key2", "value": "100" } ], "variant_field": "string value", "unknown_field": null } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm index a6c1db5..5d49e61 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.050.adm @@ -1 +1 @@ -{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": 
datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1": "value1", "key2": "100" }, "variant_field": "string value", "unknown_field": null } \ No newline at end of file +{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key": "key1", "value": "value1" }, { "key": "key2", "value": "100" } ], "variant_field": "string value", "unknown_field": null } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm index a6c1db5..5d49e61 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/iceberg/all-data-types/result.060.adm @@ -1 +1 @@ -{ "binary_field": 
"AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": { "key1": "value1", "key2": "100" }, "variant_field": "string value", "unknown_field": null } \ No newline at end of file +{ "binary_field": "AQIDBAUGBwg=", "fixed_field": "SGVsbG8gV29ybGQ=", "geometry_field": "AQIDBAUGBwgJCg==", "geography_field": "AQIDBAUGBwgJCg0=", "bool_field": true, "byte_field": 42, "short_field": 1000, "int_field": 42, "long_field": 9223372036854775807, "float_field": 3.14, "double_field": 2.718281828459045, "decimal_field": 12345.6789, "string_field": "hello world", "varchar_field": "varchar value one", "char_field": "Hi", "uuid_field": uuid("550e8400-e29b-41d4-a716-446655440000"), "date_field": date("2024-01-01"), "time_field": time("10:20:30.000"), "timestamp_field": datetime("2024-02-03T23:40:00.000"), "timestamp_ntz_field": datetime("2024-02-04T12:00:00.000"), "timestamp_nano_field": datetime("2024-02-03T23:40:00.000"), "interval_ym_field": 14, "interval_dt_field": 37230000000, "struct_field": { "name": "Alice", "age": 30, "active": true }, "list_field": [ "a", "b", "c" ], "map_field": [ { "key": "key1", "value": "value1" }, { "key": "key2", "value": "100" } ], 
"variant_field": "string value", "unknown_field": null } \ No newline at end of file diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java index aaf37d0..7ed1b24 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/IcebergParquetDataParser.java @@ -232,28 +232,31 @@ } private void parseMap(Types.MapType mapSchema, Map<?, ?> map, DataOutput out) throws IOException { - IMutableValueStorage valueBuffer = parserContext.enterObject(); + final IMutableValueStorage item = parserContext.enterCollection(); + final IMutableValueStorage valueBuffer = parserContext.enterObject(); IARecordBuilder objectBuilder = parserContext.getObjectBuilder(DefaultOpenFieldType.NESTED_OPEN_RECORD_TYPE); - valueEmbedder.enterObject(); + IAsterixListBuilder listBuilder = + parserContext.getCollectionBuilder(DefaultOpenFieldType.NESTED_OPEN_AORDERED_LIST_TYPE); Type keyType = mapSchema.keyType(); Type valueType = mapSchema.valueType(); - // TODO: we can't support non-string keys since we map MAP-TYPE to OBJECT-TYPE in AsterixDB - if (keyType != Types.StringType.get()) { - throw new RuntimeDataException(ErrorCode.TYPE_UNSUPPORTED, "Iceberg Parser", "MAP with non-string keys"); - } - for (Map.Entry<?, ?> entry : map.entrySet()) { - String fieldName = (String) entry.getKey(); - Object fieldValue = entry.getValue(); - parseValueAndAddObjectField(valueBuffer, objectBuilder, valueType, fieldName, fieldValue); + objectBuilder.reset(DefaultOpenFieldType.NESTED_OPEN_RECORD_TYPE); + valueBuffer.reset(); + parseValue(keyType, entry.getKey(), valueBuffer.getDataOutput()); + objectBuilder.addField(parserContext.getSerializedFieldName("key"), valueBuffer); + 
valueBuffer.reset(); + parseValue(valueType, entry.getValue(), valueBuffer.getDataOutput()); + objectBuilder.addField(parserContext.getSerializedFieldName("value"), valueBuffer); + item.reset(); + objectBuilder.write(item.getDataOutput(), true); + listBuilder.addItem(item); } - embedMissingValues(objectBuilder, parserContext, valueEmbedder); - objectBuilder.write(out, true); - valueEmbedder.exitObject(); + listBuilder.write(out, true); parserContext.exitObject(valueBuffer, null, objectBuilder); + parserContext.exitCollection(item, listBuilder); } private void parseValueAndAddObjectField(IMutableValueStorage valueBuffer, IARecordBuilder objectBuilder, @@ -467,8 +470,8 @@ ensureDecimalToDoubleEnabled(type, parserContext); yield ATypeTag.DOUBLE; } - case STRUCT, MAP -> ATypeTag.OBJECT; - case LIST -> ATypeTag.ARRAY; + case STRUCT -> ATypeTag.OBJECT; + case LIST, MAP -> ATypeTag.ARRAY; case DATE -> { if (parserContext.isDateAsInt()) { yield ATypeTag.INTEGER; -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21323?usp=email To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: asterixdb Gerrit-Branch: lumina Gerrit-Change-Id: I562fcb2c4c62770a67d2adb7c1f8dcaa1fca5d59 Gerrit-Change-Number: 21323 Gerrit-PatchSet: 2 Gerrit-Owner: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Hussain Towaileb <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Murtadha Hubail <[email protected]>
