This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit 384570e47579980735a593682a451266c3f4830a Author: preetham0202 <[email protected]> AuthorDate: Fri Sep 19 11:52:59 2025 +0530 [ASTERIXDB-3392][EXT] Default to STRING for null-typed fields during schema inference in COPY TO parquet - user model changes: no - storage format changes: no - interface changes: no Details: When every value of a field within a compute partition is null (or a list has no element whose type can be inferred), no type can be inferred for that field. Schema inference in COPY TO parquet now defaults such a field to the Parquet STRING type instead of failing with the empty-type error (HYR0133). Ext-ref: MB-68610 Change-Id: I60069bf3d44c60effb69e5555bbf7e869327cc5b Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20395 Reviewed-by: Ali Alsuliman <[email protected]> Integration-Tests: Jenkins <[email protected]> Tested-by: Ali Alsuliman <[email protected]> --- .../parquet-error-checks.23.ddl.sqlpp | 1 + .../parquet-null1.01.ddl.sqlpp} | 3 +++ .../parquet-null1.02.update.sqlpp} | 25 +++++++++++----------- .../parquet-null1.03.ddl.sqlpp} | 15 +++++++------ .../parquet-null1.04.query.sqlpp} | 8 +++---- .../parquet-null2.01.ddl.sqlpp} | 5 ++++- .../parquet-null2.02.update.sqlpp} | 17 ++++++++++----- .../parquet-null2.03.ddl.sqlpp} | 15 +++++++------ .../parquet-null2.04.query.sqlpp} | 8 +++---- .../parquet-simple/parquet-simple.01.ddl.sqlpp | 2 ++ .../parquet-simple/parquet-simple.02.update.sqlpp | 4 ++-- .../copy-to/parquet-null1/parquet-null1.04.adm | 2 ++ .../copy-to/parquet-null2/parquet-null2.04.adm | 3 +++ .../runtimets/testsuite_external_dataset_s3.xml | 21 +++++++++++++++++- .../writer/printer/parquet/ParquetSchemaTree.java | 21 ++++++++++++------ .../printer/parquet/SchemaCheckerLazyVisitor.java | 6 +++--- 16 files changed, 104 insertions(+), 52 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp index a2dbe7f51d..f504fd5d57 100644 --- 
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp @@ -23,6 +23,7 @@ USE test; CREATE COLLECTION TestCollection6(ColumnType1) PRIMARY KEY id; CREATE COLLECTION TestCollection7(ColumnType1) PRIMARY KEY id; +CREATE COLLECTION TestCollection8(ColumnType1) PRIMARY KEY id; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.01.ddl.sqlpp similarity index 95% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.01.ddl.sqlpp index 76970a5579..0f80378feb 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.01.ddl.sqlpp @@ -23,3 +23,6 @@ USE test; CREATE TYPE ColumnType2 AS { }; + + +CREATE DATASET col2 primary key(id:int); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.15.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.02.update.sqlpp similarity index 80% rename from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.15.update.sqlpp rename to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.02.update.sqlpp index be198a29d3..3d5158ba8e 
100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.15.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.02.update.sqlpp @@ -20,22 +20,21 @@ USE test; - - - - -insert into TestCollection({"id":1, "name": []}); - - +INSERT INTO col2 ( + [ +{"id": 1, "name": "aqay awil"}, + {"id": 2} + ] +); COPY ( -select c.* from TestCollection c - ) toWriter + select * from col2 +) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-error-checks15") +PATH (%pathprefix% "copy-to-result", "parquet-null1") WITH { %template_colons%, %additionalProperties% - "format":"parquet" - } - + "format":"parquet", + "version" : "2" +}; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.03.ddl.sqlpp similarity index 75% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.03.ddl.sqlpp index a2dbe7f51d..ab296ea796 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.03.ddl.sqlpp @@ -20,9 +20,12 @@ USE test; - -CREATE COLLECTION TestCollection6(ColumnType1) PRIMARY KEY id; -CREATE COLLECTION TestCollection7(ColumnType1) PRIMARY KEY id; - - - +CREATE EXTERNAL DATASET DatasetCopy(ColumnType2) USING %adapter% +( + %template%, + %additional_Properties%, + 
("definition"="%path_prefix%copy-to-result/parquet-null1"), + ("format" = "parquet"), + ("requireVersionChangeDetection"="false"), + ("include"="*.parquet") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.04.query.sqlpp similarity index 90% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.04.query.sqlpp index 76970a5579..22540697a1 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null1/parquet-null1.04.query.sqlpp @@ -17,9 +17,9 @@ * under the License. 
*/ -DROP DATAVERSE test if exists; -CREATE DATAVERSE test; USE test; -CREATE TYPE ColumnType2 AS { -}; + +SELECT * +FROM DatasetCopy c +ORDER BY c.col2.id; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.01.ddl.sqlpp similarity index 94% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.01.ddl.sqlpp index 76970a5579..8bac75ecc2 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.01.ddl.sqlpp @@ -22,4 +22,7 @@ CREATE DATAVERSE test; USE test; CREATE TYPE ColumnType2 AS { -}; + }; + + +CREATE DATASET col2 primary key(id:int); diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.02.update.sqlpp similarity index 78% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.02.update.sqlpp index 9b21be7bfb..ceafc41736 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.02.update.sqlpp @@ -20,15 +20,22 @@ USE test; +INSERT INTO col2 ( + [ +{"id": 1, "name": "aqay awil", "centuries" : []}, + 
{"id": 2, "centuries" : []}, + {"id":3 , "name": null, "centuries" : []} + ] +); + COPY ( - select "123" as id -) toWriter +select * from col2 + ) toWriter TO %adapter% -PATH (%pathprefix% "copy-to-result", "parquet-simple") -TYPE ( {id:string} ) +PATH (%pathprefix% "copy-to-result", "parquet-null2") WITH { %template_colons%, %additionalProperties% "format":"parquet", "version" : "2" -}; \ No newline at end of file + }; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.03.ddl.sqlpp similarity index 75% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.03.ddl.sqlpp index a2dbe7f51d..2e6e1809ed 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/negative/parquet-error-checks/parquet-error-checks.23.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.03.ddl.sqlpp @@ -20,9 +20,12 @@ USE test; - -CREATE COLLECTION TestCollection6(ColumnType1) PRIMARY KEY id; -CREATE COLLECTION TestCollection7(ColumnType1) PRIMARY KEY id; - - - +CREATE EXTERNAL DATASET DatasetCopy(ColumnType2) USING %adapter% +( + %template%, + %additional_Properties%, + ("definition"="%path_prefix%copy-to-result/parquet-null2"), + ("format" = "parquet"), + ("requireVersionChangeDetection"="false"), + ("include"="*.parquet") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.04.query.sqlpp similarity index 90% copy from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp copy to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.04.query.sqlpp index 76970a5579..22540697a1 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-null2/parquet-null2.04.query.sqlpp @@ -17,9 +17,9 @@ * under the License. */ -DROP DATAVERSE test if exists; -CREATE DATAVERSE test; USE test; -CREATE TYPE ColumnType2 AS { -}; + +SELECT * +FROM DatasetCopy c +ORDER BY c.col2.id; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp index 76970a5579..abff4f0231 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.01.ddl.sqlpp @@ -23,3 +23,5 @@ USE test; CREATE TYPE ColumnType2 AS { }; + + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp index 9b21be7bfb..d598e52862 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-simple/parquet-simple.02.update.sqlpp @@ 
-21,8 +21,8 @@ USE test; COPY ( - select "123" as id -) toWriter +select "123" as id + ) toWriter TO %adapter% PATH (%pathprefix% "copy-to-result", "parquet-simple") TYPE ( {id:string} ) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null1/parquet-null1.04.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null1/parquet-null1.04.adm new file mode 100644 index 0000000000..c1aed87d1a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null1/parquet-null1.04.adm @@ -0,0 +1,2 @@ +{ "c": { "col2": { "name": "aqay awil", "id": 1 } } } +{ "c": { "col2": { "id": 2 } } } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm new file mode 100644 index 0000000000..29ca9ecd0d --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm @@ -0,0 +1,3 @@ +{ "c": { "col2": { "centuries": [ ], "name": "aqay awil", "id": 1 } } } +{ "c": { "col2": { "centuries": [ ], "id": 2 } } } +{ "c": { "col2": { "centuries": [ ], "id": 3 } } } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index 5b1d265e20..a2677833b9 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ -69,6 +69,26 @@ <output-dir compare="Text">parquet-simple</output-dir> </compilation-unit> </test-case> + <test-case FilePath="copy-to"> + <compilation-unit name="parquet-null1"> + <placeholder name="adapter" value="S3" /> + <placeholder name="pathprefix" value="" /> + <placeholder name="path_prefix" value="" /> + 
<placeholder name="additionalProperties" value='"container":"playground",' /> + <placeholder name="additional_Properties" value='("container"="playground")' /> + <output-dir compare="Text">parquet-null1</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="copy-to"> + <compilation-unit name="parquet-null2"> + <placeholder name="adapter" value="S3" /> + <placeholder name="pathprefix" value="" /> + <placeholder name="path_prefix" value="" /> + <placeholder name="additionalProperties" value='"container":"playground",' /> + <placeholder name="additional_Properties" value='("container"="playground")' /> + <output-dir compare="Text">parquet-null2</output-dir> + </compilation-unit> + </test-case> <test-case FilePath="copy-to"> <compilation-unit name="parquet-tweet"> <placeholder name="adapter" value="S3" /> @@ -278,7 +298,6 @@ <expected-error>ASX1205: Invalid Parquet Writer Version provided '3'. Supported values: [1, 2]</expected-error> <expected-error>ASX0039: Expected integer value, got yvghc (in line 22, at column 6)</expected-error> <expected-error>ASX1209: Maximum value allowed for 'max-schemas' is 10. 
Found 15</expected-error> - <expected-error>HYR0133: Schema could not be inferred, empty types found in the result</expected-error> <expected-error>HYR0134: Schema Limit exceeded, maximum number of heterogeneous schemas allowed : '2'</expected-error> <expected-error>ASX1204: 'rectangle' type not supported in parquet format</expected-error> <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java index 5fb2ac7c6e..a76426232f 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java @@ -38,6 +38,12 @@ import org.apache.parquet.schema.Types; public class ParquetSchemaTree { private static final Logger LOGGER = LogManager.getLogger(); + private static final SchemaNode DEFAULT_SCHEMA_NODE_FOR_NULL = new SchemaNode(); + private static final FlatType DEFAULT_TYPE_TAG_FOR_NULL = new FlatType(ATypeTag.STRING); + static { + DEFAULT_SCHEMA_NODE_FOR_NULL.setType(DEFAULT_TYPE_TAG_FOR_NULL); + } + public static class SchemaNode { private AbstractType type; @@ -179,12 +185,13 @@ public class ParquetSchemaTree { public static void buildParquetSchema(Types.Builder builder, SchemaNode schemaNode, String columnName) throws HyracksDataException { if (schemaNode.getType() == null) { - LOGGER.info( - "Child type not set for record value with column name: " + LogRedactionUtil.userData(columnName)); - throw new HyracksDataException(ErrorCode.EMPTY_TYPE_INFERRED); + buildFlat(builder, DEFAULT_TYPE_TAG_FOR_NULL, columnName); + return; } AbstractType typeClass = schemaNode.getType(); - if (typeClass 
instanceof RecordType) { + if (typeClass == null) { + buildFlat(builder, DEFAULT_TYPE_TAG_FOR_NULL, columnName); + } else if (typeClass instanceof RecordType) { buildRecord(builder, (RecordType) schemaNode.getType(), columnName); } else if (typeClass instanceof ListType) { buildList(builder, (ListType) schemaNode.getType(), columnName); @@ -206,10 +213,10 @@ public class ParquetSchemaTree { Types.BaseListBuilder<?, ?> childBuilder = getListChild(builder); SchemaNode child = type.child; if (child == null) { - LOGGER.info("Child type not set for list with column name: " + LogRedactionUtil.userData(columnName)); - throw new HyracksDataException(ErrorCode.EMPTY_TYPE_INFERRED); + buildParquetSchema(childBuilder, DEFAULT_SCHEMA_NODE_FOR_NULL, columnName); + } else { + buildParquetSchema(childBuilder, child, columnName); } - buildParquetSchema(childBuilder, child, columnName); } private static void buildFlat(Types.Builder builder, FlatType type, String columnName) throws HyracksDataException { diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java index 6e03144ac9..5e911a6ba2 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java @@ -113,13 +113,13 @@ public class SchemaCheckerLazyVisitor implements ISchemaChecker, @Override public ISchemaChecker.SchemaComparisonType visit(FlatLazyVisitablePointable currentValue, ParquetSchemaTree.SchemaNode schemaNode) throws HyracksDataException { - if (schemaNode.getType() == null) { - return SchemaComparisonType.GROWING; - } // SchemaNode.getTypeTag can never be MISSING here if (currentValue.getTypeTag() == 
ATypeTag.NULL) { return SchemaComparisonType.EQUIVALENT; } + if (schemaNode.getType() == null) { + return SchemaComparisonType.GROWING; + } if (!(schemaNode.getType() instanceof ParquetSchemaTree.FlatType inferredType)) { return ISchemaChecker.SchemaComparisonType.CONFLICTING; }
