This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit e5a535fa562f2e3b73c0e79d0a5c1ffafa049e31 Author: preetham0202 <[email protected]> AuthorDate: Sun Jul 27 01:45:19 2025 +0530 [ASTERIXDB-3392] Fix Hdfs tests - user model changes: no - storage format changes: no - interface changes: no Ext-ref: MB-66710 Change-Id: I60fcbc11db2a9b1b4973bdcdb803db8481863071 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20144 Integration-Tests: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Reviewed-by: Preetham Poluparthi <[email protected]> Reviewed-by: Ali Alsuliman <[email protected]> --- .../parquet-tweet/parquet-tweet.03.update.sqlpp | 181 --------------------- .../parquet-utf8/parquet-utf8.03.update.sqlpp | 1 - .../copy-to-hdfs/parquet-utf8/parquet-utf8.05.adm | 16 +- .../resources/runtimets/testsuite_sqlpp_hdfs.xml | 2 + .../printer/parquet/ParquetRecordLazyVisitor.java | 4 +- .../printer/parquet/ParquetSchemaLazyVisitor.java | 21 +-- .../printer/parquet/SchemaCheckerLazyVisitor.java | 13 +- 7 files changed, 29 insertions(+), 209 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-tweet/parquet-tweet.03.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-tweet/parquet-tweet.03.update.sqlpp index 9bae74bfec..2b265dcccc 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-tweet/parquet-tweet.03.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-tweet/parquet-tweet.03.update.sqlpp @@ -24,187 +24,6 @@ COPY ( ) toWriter TO hdfs PATH ("copy-to-result", "parquet-tweet") -TYPE ( { - coordinates: { - coordinates: [ - double - ], - `type` : string - }, - created_at: string, - entities: { - urls: [ - { - display_url: string, - expanded_url: string, - indices: [ - int - ], - url: string - } - ], - user_mentions: [ - { - id: int, - id_str: string, - indices: [ - int - ], - name: string, - screen_name: string - } - ] - }, - favorite_count: int, - favorited: boolean, - filter_level: string, - geo: { - coordinates: [ - double - ], - `type`: string - }, - id: string, - id_str: string, - in_reply_to_screen_name: string, - in_reply_to_status_id: int, - in_reply_to_status_id_str: string, - in_reply_to_user_id: int, - in_reply_to_user_id_str: string, - is_quote_status: boolean, - lang: string, - place: { - bounding_box: { - coordinates: [ - [ - [ - double - ] - ] - ], - `type`: string - }, - country: string, - country_code: string, - full_name: string, - id: string, - name: string, - place_type: string, - url: string - }, - possibly_sensitive: boolean, - quoted_status: { - created_at: string, - entities: { - user_mentions: [ - { - id: int, - id_str: string, - indices: [ - int - ], - name: string, - screen_name: string - } - ] - }, - favorite_count: int, - favorited: boolean, - filter_level: string, - id: int, - id_str: string, - in_reply_to_screen_name: string, - in_reply_to_status_id: int, - in_reply_to_status_id_str: string, - in_reply_to_user_id: int, - in_reply_to_user_id_str: string, - is_quote_status: boolean, - lang: string, - retweet_count: int, - retweeted: boolean, - source: string, - text: string, - truncated: boolean, - user: { - contributors_enabled: boolean, - created_at: string, - default_profile: boolean, - default_profile_image: boolean, - description: string, - favourites_count: int, - followers_count: int, - friends_count: int, - geo_enabled: boolean, - id: int, - id_str: string, - is_translator: boolean, - lang: string, - listed_count: int, - name: string, - profile_background_color: string, - profile_background_image_url: string, - profile_background_image_url_https: string, - profile_background_tile: boolean, - profile_banner_url: string, - profile_image_url: string, - profile_image_url_https: string, - profile_link_color: string, - profile_sidebar_border_color: string, - profile_sidebar_fill_color: string, - profile_text_color: string, - profile_use_background_image: boolean, - protected: boolean, - screen_name: string, - statuses_count: int, - verified: boolean - } - }, - quoted_status_id: int, - quoted_status_id_str: string, - retweet_count: int, - retweeted: boolean, - source: string, - text: string, - timestamp_ms: string, - truncated: boolean, - user: { - contributors_enabled: boolean, - created_at: string, - default_profile: boolean, - default_profile_image: boolean, - description: string, - favourites_count: int, - followers_count: int, - friends_count: int, - geo_enabled: boolean, - id: int, - id_str: string, - is_translator: boolean, - lang: string, - listed_count: int, - location: string, - name: string, - profile_background_color: string, - profile_background_image_url: string, - profile_background_image_url_https: string, - profile_background_tile: boolean, - profile_banner_url: string, - profile_image_url: string, - profile_image_url_https: string, - profile_link_color: string, - profile_sidebar_border_color: string, - profile_sidebar_fill_color: string, - profile_text_color: string, - profile_use_background_image: boolean, - protected: boolean, - screen_name: string, - statuses_count: int, - time_zone: string, - url: string, - utc_offset: int, - verified: boolean - } - } ) WITH { "hdfs":"hdfs://127.0.0.1:31888", "format":"parquet" diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-utf8/parquet-utf8.03.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-utf8/parquet-utf8.03.update.sqlpp index 79b2ddabd7..4d84bed15d 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-utf8/parquet-utf8.03.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to-hdfs/parquet-utf8/parquet-utf8.03.update.sqlpp @@ -24,7 +24,6 @@ COPY ( ) toWriter TO hdfs PATH ("copy-to-result", "parquet-utf8") -TYPE ( { comment:string, id:bigint, name:string } ) WITH { "hdfs":"hdfs://127.0.0.1:31888", "format":"parquet" diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to-hdfs/parquet-utf8/parquet-utf8.05.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to-hdfs/parquet-utf8/parquet-utf8.05.adm index c60145d7f4..3ea2e8d765 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to-hdfs/parquet-utf8/parquet-utf8.05.adm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to-hdfs/parquet-utf8/parquet-utf8.05.adm @@ -1,8 +1,8 @@ -{ "id": 1, "name": "John" } -{ "id": 2, "name": "Abel" } -{ "id": 3, "name": "Sandy" } -{ "id": 4, "name": "Alex" } -{ "id": 5, "name": "Mike" } -{ "id": 6, "name": "Tom" } -{ "comment": "😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا", "id": 7, "name": "Jerry" } -{ "comment": "😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffe e ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حس [...] +{ "name": "John", "id": 1 } +{ "name": "Abel", "id": 2 } +{ "name": "Sandy", "id": 3 } +{ "name": "Alex", "id": 4 } +{ "name": "Mike", "id": 5 } +{ "name": "Tom", "id": 6 } +{ "name": "Jerry", "comment": "😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا", "id": 7 } +{ "name": "William", "comment": "😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢 😢💉💉. Coffee ☕‼️😃. حسنا😢😢💉💉 = 𩸽 😢😢 [...] diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml index c163fd2a04..05e871d32a 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml @@ -263,6 +263,8 @@ <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>HYR0132: Extra field in the result, field 'c' does not exist at 'root' in the schema</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> <source-location>false</source-location> </compilation-unit> </test-case> diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java index d57d3c7e11..cc6f98fbbd 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetRecordLazyVisitor.java @@ -184,11 +184,11 @@ public class ParquetRecordLazyVisitor implements ILazyVisitablePointableVisitor< for (int i = 0; i < pointable.getNumberOfChildren(); i++) { pointable.nextChild(); AbstractLazyVisitablePointable child = pointable.getChildVisitablePointable(); - String columnName = fieldNamesDictionary.getOrCreateFieldNameIndex(pointable.getFieldName()); if (child.getTypeTag() == ATypeTag.MISSING) { continue; } nonMissingChildren++; + String columnName = fieldNamesDictionary.getOrCreateFieldNameIndex(pointable.getFieldName()); if (!groupType.containsField(columnName)) { LOGGER.info("Group type: {} does not contain field in record type: {}", LogRedactionUtil.userData(groupType.getName()), LogRedactionUtil.userData(columnName)); @@ -299,12 +299,12 @@ public class ParquetRecordLazyVisitor implements ILazyVisitablePointableVisitor< recordConsumer.startMessage(); for (int i = 0; i < rec.getNumberOfChildren(); i++) { rec.nextChild(); - String columnName = fieldNamesDictionary.getOrCreateFieldNameIndex(rec.getFieldName()); AbstractLazyVisitablePointable child = rec.getChildVisitablePointable(); if (child.getTypeTag() == ATypeTag.MISSING) { continue; } nonMissingChildren++; + String columnName = fieldNamesDictionary.getOrCreateFieldNameIndex(rec.getFieldName()); if (!schema.containsField(columnName)) { LOGGER.info("Schema: {} does not contain field: {}", LogRedactionUtil.userData(schema.toString()), LogRedactionUtil.userData(columnName)); diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java index 3f0de608d1..d43947c449 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java @@ -97,7 +97,8 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< schemaNode.setType(new ParquetSchemaTree.ListType()); } if (!(schemaNode.getType() instanceof ParquetSchemaTree.ListType listType)) { - LOGGER.info("Incompatible type found in list: {} and {}" ,LogRedactionUtil.userData(schemaNode.toString()) ,pointable.getTypeTag()); + LOGGER.info("Incompatible type found in list: {} and {}", LogRedactionUtil.userData(schemaNode.toString()), + pointable.getTypeTag()); throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } int numChildren = pointable.getNumberOfChildren(); @@ -105,7 +106,7 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< pointable.nextChild(); AbstractLazyVisitablePointable child = pointable.getChildVisitablePointable(); - if(child.getTypeTag()==ATypeTag.MISSING) { + if (child.getTypeTag() == ATypeTag.MISSING) { throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } @@ -118,15 +119,14 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< } @Override - public Void visit(FlatLazyVisitablePointable pointable,ParquetSchemaTree.SchemaNode schemaNode) + public Void visit(FlatLazyVisitablePointable pointable, ParquetSchemaTree.SchemaNode schemaNode) throws HyracksDataException { - if(pointable.getTypeTag() == ATypeTag.NULL) { - return null; + if (pointable.getTypeTag() == ATypeTag.NULL) { + return null; } if (schemaNode.getType() == null) { - if (pointable.getTypeTag() == ATypeTag.MISSING) - { + if (pointable.getTypeTag() == ATypeTag.MISSING) { foundMissing = true; schemaNode.setType(new ParquetSchemaTree.FlatType(ATypeTag.MISSING)); return null; @@ -186,11 +186,12 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< private static void removeMissing(ParquetSchemaTree.SchemaNode schemaNode) { if (schemaNode.getType() == null) { - return; + return; } if (schemaNode.getType() instanceof ParquetSchemaTree.RecordType recordType) { - recordType.getChildren().entrySet().removeIf( - entry -> (entry.getValue().getType() instanceof ParquetSchemaTree.FlatType flatType && flatType.getTypeTag() == ATypeTag.MISSING)); + recordType.getChildren().entrySet() + .removeIf(entry -> (entry.getValue().getType() instanceof ParquetSchemaTree.FlatType flatType + && flatType.getTypeTag() == ATypeTag.MISSING)); for (Map.Entry<String, ParquetSchemaTree.SchemaNode> entry : recordType.getChildren().entrySet()) { removeMissing(entry.getValue()); diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java index e484e84035..6e03144ac9 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java @@ -64,19 +64,18 @@ public class SchemaCheckerLazyVisitor implements ISchemaChecker, for (int i = 0; i < pointable.getNumberOfChildren(); i++) { pointable.nextChild(); AbstractLazyVisitablePointable child = pointable.getChildVisitablePointable(); - if(child.getTypeTag() == ATypeTag.MISSING){ - continue; + if (child.getTypeTag() == ATypeTag.MISSING) { + continue; } nonMissingChildren++; String childColumnName = fieldNamesDictionary.getOrCreateFieldNameIndex(pointable.getFieldName()); ParquetSchemaTree.SchemaNode childType = recordType.getChildren().get(childColumnName); if (childType == null) { - schemaComparisonType = ISchemaChecker.max(schemaComparisonType, SchemaComparisonType.CONFLICTING); - continue; + return SchemaComparisonType.CONFLICTING; } schemaComparisonType = ISchemaChecker.max(schemaComparisonType, child.accept(this, childType)); } - if(nonMissingChildren!= recordType.getChildren().size()) { + if (nonMissingChildren != recordType.getChildren().size()) { return SchemaComparisonType.CONFLICTING; } return schemaComparisonType; @@ -118,7 +117,7 @@ public class SchemaCheckerLazyVisitor implements ISchemaChecker, return SchemaComparisonType.GROWING; } // SchemaNode.getTypeTag can never be MISSING here - if(currentValue.getTypeTag()==ATypeTag.NULL){ + if (currentValue.getTypeTag() == ATypeTag.NULL) { return SchemaComparisonType.EQUIVALENT; } if (!(schemaNode.getType() instanceof ParquetSchemaTree.FlatType inferredType)) { @@ -131,7 +130,7 @@ public class SchemaCheckerLazyVisitor implements ISchemaChecker, if (!inferredType.isCompatibleWith(currentValue.getTypeTag())) { return ISchemaChecker.SchemaComparisonType.CONFLICTING; } - if(inferredType.isStrictChildOf(currentValue.getTypeTag())) { + if (inferredType.isStrictChildOf(currentValue.getTypeTag())) { return ISchemaChecker.SchemaComparisonType.GROWING; } if (inferredType.isStrictParentOf(currentValue.getTypeTag())) {
