This is an automated email from the ASF dual-hosted git repository.

mhubail pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit f19c3457ca6c481ae835d419ec21b5f6cdcd3f4c Author: preetham0202 <[email protected]> AuthorDate: Wed Mar 26 16:33:26 2025 +0530 [ASTERIXDB-3392] Add type hierarchy in schema inference for copy to parquet Details: Implemented type hierarchy: double > float > int64 > int32. Schema inference will now assign the largest type encountered for a given field. Ext-ref: MB-65895 Change-Id: Iabc5e2d8e68bf17f4c080ddaf6cc41141a0c3345 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19559 Integration-Tests: Jenkins <[email protected]> Tested-by: Ali Alsuliman <[email protected]> Reviewed-by: Ali Alsuliman <[email protected]> --- .../parquet-type-hierarchy.01.ddl.sqlpp | 29 ++++++++++++++++ .../parquet-type-hierarchy.02.update.sqlpp | 39 ++++++++++++++++++++++ .../parquet-type-hierarchy.03.update.sqlpp | 32 ++++++++++++++++++ .../parquet-type-hierarchy.04.update.sqlpp | 36 ++++++++++++++++++++ .../parquet-type-hierarchy.05.query.sqlpp | 25 ++++++++++++++ .../parquet-type-hierarchy.05.adm | 11 ++++++ .../runtimets/testsuite_external_dataset_s3.xml | 10 ++++++ .../resources/runtimets/testsuite_sqlpp_hdfs.xml | 4 +++ .../printer/parquet/AsterixParquetTypeMap.java | 8 +++++ .../printer/parquet/ParquetSchemaLazyVisitor.java | 13 ++++---- .../writer/printer/parquet/ParquetSchemaTree.java | 31 ++++++++++++----- .../printer/parquet/SchemaCheckerLazyVisitor.java | 4 +-- 12 files changed, 224 insertions(+), 18 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp new file mode 100644 index 0000000000..6ab6331154 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test if exists; +CREATE DATAVERSE test; +USE test; + +CREATE TYPE ColumnType1 AS { + id: int + }; + + +CREATE COLLECTION TestCollection(ColumnType1) PRIMARY KEY id; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp new file mode 100644 index 0000000000..9a6f3c408d --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use test; + + +insert into TestCollection({"id":2}); +insert into TestCollection({"id":5,"rating" :1 , "ratings" : [] }); +insert into TestCollection({"id":8,"rating" :2 , "ratings" : [ 1 ] }); +insert into TestCollection({"id":10,"rating" :3.0 , "ratings" : [ 1, 2, 3] }); +insert into TestCollection({"id":12,"rating" :4.3 , "ratings" : [1,2,3.0,4,5]}); +insert into TestCollection({"id":15,"rating" :4.7 , "ratings" : [1.0,2.0,3.0,4.0,5.0]}); +insert into TestCollection({"id":17,"rating" :4.22222 , "ratings" : [1.1111,2.222222,3.3333,4.44444,5.555555]}); +insert into TestCollection({"id":20,"rating" :5.455555555 , "ratings" : [1,2,3,4,5]}); +insert into TestCollection({"id":21,"rating" :1 , "ratings" : [0.0,6.7]}); +insert into TestCollection({"id":27,"rating" :8 , "ratings" : [1]}); +insert into TestCollection({"id":28,"rating" :3 , "ratings" : []}); + + + + + + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp new file mode 100644 index 0000000000..b4a7b18935 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +COPY ( +select c.* from TestCollection c + ) toWriter +TO %adapter% +PATH (%pathprefix% "copy-to-result", "parquet-type-hierarchy") +WITH { + %template_colons%, + %additionalProperties% + "format":"parquet", + "max-schemas" : "1" + }; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp new file mode 100644 index 0000000000..dad1090f88 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + + +CREATE TYPE ColumnType2 AS { + }; + + + +CREATE EXTERNAL DATASET TestDataset(ColumnType2) USING %adapter% +( + %template%, + %additional_Properties%, + ("definition"="%path_prefix%copy-to-result/parquet-type-hierarchy/"), + ("include"="*.parquet"), + ("requireVersionChangeDetection"="false"), + ("format" = "parquet") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp new file mode 100644 index 0000000000..3d26c186be --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + + +SELECT c.* +FROM TestDataset c +ORDER BY c.id; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm new file mode 100644 index 0000000000..5c4c334aca --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm @@ -0,0 +1,11 @@ +{ "id": 2 } +{ "ratings": [ ], "rating": 1.0, "id": 5 } +{ "ratings": [ 1 ], "rating": 2, "id": 8 } +{ "ratings": [ 1, 2, 3 ], "rating": 3, "id": 10 } +{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 4.3, "id": 12 } +{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 4.7, "id": 15 } +{ "ratings": [ 1.1111, 2.222222, 3.3333, 4.44444, 5.555555 ], "rating": 4.22222, "id": 17 } +{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 5.455555555, "id": 20 } +{ "ratings": [ 0.0, 6.7 ], "rating": 1.0, "id": 21 } +{ "ratings": [ 1.0 ], "rating": 8.0, "id": 27 } +{ "ratings": [ ], "rating": 3, "id": 28 } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index e925dc60f6..970ccdd405 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ 
-144,6 +144,16 @@ <output-dir compare="Text">parquet-empty-array</output-dir> </compilation-unit> </test-case> + <test-case FilePath="copy-to"> + <compilation-unit name="parquet-type-hierarchy"> + <placeholder name="adapter" value="S3" /> + <placeholder name="pathprefix" value="" /> + <placeholder name="path_prefix" value="" /> + <placeholder name="additionalProperties" value='"container":"playground",' /> + <placeholder name="additional_Properties" value='("container"="playground")' /> + <output-dir compare="Text">parquet-type-hierarchy</output-dir> + </compilation-unit> + </test-case> <test-case FilePath="copy-to"> <compilation-unit name="empty-path"> <output-dir compare="Text">empty-path</output-dir> diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml index 36087a531f..c163fd2a04 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml @@ -259,6 +259,10 @@ <expected-error>HYR0133: Schema could not be inferred, empty types found in the result</expected-error> <expected-error>HYR0134: Schema Limit exceeded, maximum number of heterogeneous schemas allowed : '2'</expected-error> <expected-error>ASX1204: 'rectangle' type not supported in parquet format</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> + <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error> <source-location>false</source-location> </compilation-unit> </test-case> diff --git 
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java index 1eb8e844fe..6bdc586c10 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java @@ -26,6 +26,14 @@ import org.apache.parquet.schema.PrimitiveType; public class AsterixParquetTypeMap { + public static final Map<ATypeTag, Integer> HIERARCHIAL_TYPES = Map.ofEntries(Map.entry(ATypeTag.TINYINT, 1), + Map.entry(ATypeTag.SMALLINT, 1), Map.entry(ATypeTag.INTEGER, 1), Map.entry(ATypeTag.BIGINT, 2), + Map.entry(ATypeTag.FLOAT, 3), Map.entry(ATypeTag.DOUBLE, 4)); + + public static ATypeTag maxHierarchicalType(ATypeTag a, ATypeTag b) { + return HIERARCHIAL_TYPES.get(a) > HIERARCHIAL_TYPES.get(b) ? 
a : b; + } + public static final Map<ATypeTag, PrimitiveType.PrimitiveTypeName> PRIMITIVE_TYPE_NAME_MAP = Map.ofEntries(Map.entry(ATypeTag.BOOLEAN, PrimitiveType.PrimitiveTypeName.BOOLEAN), Map.entry(ATypeTag.STRING, PrimitiveType.PrimitiveTypeName.BINARY), diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java index 9ea6d77b0a..bd869e9ee6 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java @@ -110,21 +110,20 @@ public class ParquetSchemaLazyVisitor implements ILazyVisitablePointableVisitor< if (!AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.containsKey(pointable.getTypeTag())) { throw RuntimeDataException.create(TYPE_UNSUPPORTED_PARQUET_WRITE, pointable.getTypeTag()); } - schemaNode.setType(new ParquetSchemaTree.FlatType( - AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(pointable.getTypeTag()), - AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP.get(pointable.getTypeTag()))); + schemaNode.setType(new ParquetSchemaTree.FlatType(pointable.getTypeTag())); return null; } if (!(schemaNode.getType() instanceof ParquetSchemaTree.FlatType)) { throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType(); - if (!(flatType.getPrimitiveTypeName() == AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP - .get(pointable.getTypeTag())) - || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP - .get(pointable.getTypeTag()))) { + + if (!flatType.isCompatibleWith(pointable.getTypeTag())) { throw 
RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY); } + + flatType.coalesce(pointable.getTypeTag()); + return null; } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java index ff512dff5c..2ca086ff55 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java @@ -25,6 +25,7 @@ import static org.apache.asterix.external.writer.printer.parquet.ParquetSchemaBu import java.util.HashMap; import java.util.Map; +import org.apache.asterix.om.types.ATypeTag; import org.apache.hyracks.api.exceptions.ErrorCode; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.parquet.schema.LogicalTypeAnnotation; @@ -73,21 +74,35 @@ public class ParquetSchemaTree { } static class FlatType extends AbstractType { - private final PrimitiveType.PrimitiveTypeName primitiveTypeName; - private final LogicalTypeAnnotation logicalTypeAnnotation; + private ATypeTag typeTag; + private boolean isHierarchical; - public FlatType(PrimitiveType.PrimitiveTypeName primitiveTypeName, - LogicalTypeAnnotation logicalTypeAnnotation) { - this.primitiveTypeName = primitiveTypeName; - this.logicalTypeAnnotation = logicalTypeAnnotation; + public FlatType(ATypeTag typeTag) { + this.typeTag = typeTag; + isHierarchical = AsterixParquetTypeMap.HIERARCHIAL_TYPES.containsKey(typeTag); + } + + public boolean isCompatibleWith(ATypeTag typeTag) { + if (isHierarchical) { + return AsterixParquetTypeMap.HIERARCHIAL_TYPES.containsKey(typeTag); + } else { + return this.typeTag == typeTag; + } + } + + public void coalesce(ATypeTag typeTag) { + if (!isCompatibleWith(typeTag) || !isHierarchical) 
{ + return; + } + this.typeTag = AsterixParquetTypeMap.maxHierarchicalType(this.typeTag, typeTag); } public LogicalTypeAnnotation getLogicalTypeAnnotation() { - return logicalTypeAnnotation; + return AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP.get(typeTag); } public PrimitiveType.PrimitiveTypeName getPrimitiveTypeName() { - return primitiveTypeName; + return AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(typeTag); } } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java index fc43c89afc..28d4247f14 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java @@ -115,9 +115,7 @@ public class SchemaCheckerLazyVisitor implements ISchemaChecker, ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType(); - if (flatType.getPrimitiveTypeName() != AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(pointable.getTypeTag()) - || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP - .get(pointable.getTypeTag()))) { + if (!flatType.isCompatibleWith(pointable.getTypeTag())) { return ISchemaChecker.SchemaComparisonType.CONFLICTING; }
