This is an automated email from the ASF dual-hosted git repository.

mhubail pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit f8e7c85cb2cbcf706eba43d6beb86d41387fb01e Author: preetham0202 <[email protected]> AuthorDate: Mon Mar 24 11:36:45 2025 +0530 [ASTERIXDB-3392] Error out on mixed-type arrays in Parquet Details: Parquet does not support arrays with mixed data types. This patch introduces a check to detect such cases and explicitly error out when a user attempts to write an array containing different types. Ext-ref: MB-65899 Change-Id: I07f382f802fd61e55c23c3ce23ccfcc634ede13d Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19545 Integration-Tests: Jenkins <[email protected]> Tested-by: Hussain Towaileb <[email protected]> Reviewed-by: Hussain Towaileb <[email protected]> --- .../parquet-error-checks.17.update.sqlpp | 2 +- ....update.sqlpp => parquet-error-checks.18.ddl.sqlpp} | 18 ++++-------------- ...date.sqlpp => parquet-error-checks.19.update.sqlpp} | 11 ++++++----- ...date.sqlpp => parquet-error-checks.20.update.sqlpp} | 11 ++++++----- ...date.sqlpp => parquet-error-checks.21.update.sqlpp} | 11 ++++++----- ...date.sqlpp => parquet-error-checks.22.update.sqlpp} | 11 ++++++----- .../runtimets/testsuite_external_dataset_s3.xml | 4 ++++ .../apache/asterix/common/exceptions/ErrorCode.java | 1 + .../src/main/resources/asx_errormsg/en.properties | 1 + .../printer/parquet/ParquetSchemaLazyVisitor.java | 10 +++++----- 10 files changed, 40 insertions(+), 40 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp index e956812458..9ebd4730e3 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp +++ 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp @@ -26,7 +26,7 @@ COPY ( select id,name from TestCollection c ) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks16") +PATH (%pathprefix% "copy-to-result", "parquet-error-checks17") TYPE ( { id:int, rect: rectangle }) WITH { %template_colons%, diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.18.ddl.sqlpp similarity index 74% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.18.ddl.sqlpp index e956812458..8e13397a57 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.18.ddl.sqlpp @@ -21,20 +21,10 @@ USE test; - -COPY ( -select id,name from TestCollection c - ) toWriter -TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks16") -TYPE ( { id:int, rect: rectangle }) -WITH { - %template_colons%, - %additionalProperties% - "format":"parquet", - "max-schemas" : "2" - } - +CREATE COLLECTION TestCollection2(ColumnType1) PRIMARY KEY id; +CREATE COLLECTION TestCollection3(ColumnType1) PRIMARY KEY id; +CREATE COLLECTION TestCollection4(ColumnType1) PRIMARY KEY id; +CREATE COLLECTION TestCollection5(ColumnType1) PRIMARY KEY id; diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.19.update.sqlpp similarity index 83% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.19.update.sqlpp index e956812458..7add0a4459 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.19.update.sqlpp @@ -22,17 +22,18 @@ USE test; +insert into TestCollection2({"id":10, "names": [ "Virat" , 18 ] }); + + COPY ( -select id,name from TestCollection c +select * from TestCollection2 c ) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks16") -TYPE ( { id:int, rect: rectangle }) +PATH (%pathprefix% "copy-to-result", "parquet-error-checks19") WITH { %template_colons%, %additionalProperties% - "format":"parquet", - "max-schemas" : "2" + "format":"parquet" } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.20.update.sqlpp similarity index 82% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp copy to 
asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.20.update.sqlpp index e956812458..c0a8308929 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.20.update.sqlpp @@ -22,17 +22,18 @@ USE test; +insert into TestCollection3({"id":10, "names": [ { "first":"Virat"} , 18 ] }); + + COPY ( -select id,name from TestCollection c +select * from TestCollection3 c ) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks16") -TYPE ( { id:int, rect: rectangle }) +PATH (%pathprefix% "copy-to-result", "parquet-error-checks20") WITH { %template_colons%, %additionalProperties% - "format":"parquet", - "max-schemas" : "2" + "format":"parquet" } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.21.update.sqlpp similarity index 82% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.21.update.sqlpp index e956812458..8fc87b47c6 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.21.update.sqlpp @@ -22,17 +22,18 @@ USE test; +insert into TestCollection4({"id":10, "names": [ "Virat" , 
[18]] }); + + COPY ( -select id,name from TestCollection c +select * from TestCollection4 c ) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks16") -TYPE ( { id:int, rect: rectangle }) +PATH (%pathprefix% "copy-to-result", "parquet-error-checks21") WITH { %template_colons%, %additionalProperties% - "format":"parquet", - "max-schemas" : "2" + "format":"parquet" } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.22.update.sqlpp similarity index 82% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.22.update.sqlpp index e956812458..ba501a26c9 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.17.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.22.update.sqlpp @@ -22,17 +22,18 @@ USE test; +insert into TestCollection5({"id":10, "names": [ 18, { "first":"Virat"} ] }); + + COPY ( -select id,name from TestCollection c +select * from TestCollection5 c ) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks16") -TYPE ( { id:int, rect: rectangle }) +PATH (%pathprefix% "copy-to-result", "parquet-error-checks22") WITH { %template_colons%, %additionalProperties% - "format":"parquet", - "max-schemas" : "2" + "format":"parquet" } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml 
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index db72e10aba..72d2575647 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ -221,6 +221,10 @@ <expected-error>HYR0133: Schema could not be inferred, empty types found in the result</expected-error> <expected-error>HYR0134: Schema Limit exceeded, maximum number of heterogeneous schemas allowed : '2'</expected-error> <expected-error>ASX1204: 'rectangle' type not supported in parquet format</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> </compilation-unit> </test-case> <test-case FilePath="copy-to/negative"> diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java index 81b71ed9eb..46d63bb072 100644 --- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java @@ -100,6 +100,7 @@ public enum ErrorCode implements IError { FAILED_TO_READ_KEY(69), AVRO_SUPPORTED_TYPE_WITH_OPTION(70), CLOUD_IO_FAILURE(71), + PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY(72), UNSUPPORTED_JRE(100), EXTERNAL_UDF_RESULT_TYPE_ERROR(200), diff --git a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties index c9b87265ca..d6a171ff78 100644 
--- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties +++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties @@ -106,6 +106,7 @@ 69 = Failed to read key. Reason: %1$s. 70 = Avro type '%1$s' is not supported by default. To enable type conversion, recreate the external dataset with the option '%2$s' enabled 71 = Cloud I/O '%1$s' operation failed for file '%2$s' while operating on files '%3$s'. +72 = Parquet does not support arrays containing mixed data types 100 = Unsupported JRE: %1$s diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java index 70872bb2ec..9ea6d77b0a 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java @@ -18,6 +18,7 @@ */ package org.apache.asterix.external.writer.printer.parquet; +import static org.apache.asterix.common.exceptions.ErrorCode.PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY; import static org.apache.asterix.common.exceptions.ErrorCode.TYPE_UNSUPPORTED_PARQUET_WRITE; import static org.apache.asterix.external.writer.printer.parquet.ParquetSchemaTree.buildParquetSchema; @@ -33,7 +34,6 @@ import org.apache.asterix.om.lazy.TypedRecordLazyVisitablePointable; import org.apache.asterix.om.types.ARecordType; import org.apache.asterix.om.types.ATypeTag; import org.apache.asterix.om.types.IAType; -import org.apache.hyracks.api.exceptions.ErrorCode; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.data.std.api.IValueReference; import org.apache.parquet.schema.MessageType; @@ -63,7 +63,7 @@ public class ParquetSchemaLazyVisitor implements 
ILazyVisitablePointableVisitor< schemaNode.setType(new ParquetSchemaTree.RecordType()); } if (!(schemaNode.getType() instanceof ParquetSchemaTree.RecordType)) { - throw new HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA); + throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } ParquetSchemaTree.RecordType recordType = (ParquetSchemaTree.RecordType) schemaNode.getType(); for (int i = 0; i < pointable.getNumberOfChildren(); i++) { @@ -89,7 +89,7 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< schemaNode.setType(new ParquetSchemaTree.ListType()); } if (!(schemaNode.getType() instanceof ParquetSchemaTree.ListType)) { - throw new HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA); + throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } ParquetSchemaTree.ListType listType = (ParquetSchemaTree.ListType) schemaNode.getType(); for (int i = 0; i < pointable.getNumberOfChildren(); i++) { @@ -116,14 +116,14 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< return null; } if (!(schemaNode.getType() instanceof ParquetSchemaTree.FlatType)) { - throw new HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA); + throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType(); if (!(flatType.getPrimitiveTypeName() == AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP .get(pointable.getTypeTag())) || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP .get(pointable.getTypeTag()))) { - throw new HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA); + throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } return null; }
