This is an automated email from the ASF dual-hosted git repository. mblow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit 1b85c6482e7def7b7a8e00421d05436fc2537f26 Author: Ritik Raj <[email protected]> AuthorDate: Tue Jan 7 09:14:43 2025 +0530 [ASTERIXDB-3540][COMP] Fixed calculation of expected schema for pushdown - user model changes: no - storage format changes: no - interface changes: no Details: if the getField expr consisted of a function which needs to be evaluated at runtime, the pushdown computer was not evaluating those expressions, leading to incorrect computation. eg: 1. `field-access-by-name`(t.r.p, x.y.age_field) 2. `field-access-by-name`(t.r.p, substring(x.y.age_field, 0, 4)) Ext-ref: MB-64730 Change-Id: Iac55527af143c292557158ca8e47e92538e93970 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19288 Reviewed-by: Murtadha Hubail <[email protected]> Tested-by: Murtadha Hubail <[email protected]> Integration-Tests: Murtadha Hubail <[email protected]> --- asterixdb/NOTICE | 2 +- .../rules/pushdown/ExpectedSchemaBuilder.java | 67 ++++++++++++++++++++++ .../asterix-app/data/hdfs/parquet/friends.json | 1 + .../external_dataset/ExternalDatasetTestUtils.java | 1 + .../ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp | 41 +++++++++++++ .../ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp | 26 +++++++++ .../ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp | 25 ++++++++ .../parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan | 1 + .../parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm | 1 + .../runtimets/testsuite_external_dataset_s3.xml | 6 ++ hyracks-fullstack/NOTICE | 2 +- 11 files changed, 171 insertions(+), 2 deletions(-) diff --git a/asterixdb/NOTICE b/asterixdb/NOTICE index 06d538de68..5118782978 100644 --- a/asterixdb/NOTICE +++ b/asterixdb/NOTICE @@ -1,5 +1,5 @@ Apache AsterixDB -Copyright 2015-2024 The Apache Software Foundation +Copyright 2015-2025 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). 
diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java index b7632db0e5..a9937d1af3 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java @@ -22,6 +22,7 @@ import static org.apache.asterix.optimizer.rules.pushdown.ExpressionValueAccessP import static org.apache.asterix.optimizer.rules.pushdown.ExpressionValueAccessPushdownVisitor.SUPPORTED_FUNCTIONS; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.asterix.om.functions.BuiltinFunctions; @@ -37,6 +38,7 @@ import org.apache.asterix.optimizer.rules.pushdown.schema.RootExpectedSchemaNode import org.apache.asterix.optimizer.rules.pushdown.schema.UnionExpectedSchemaNode; import org.apache.asterix.runtime.projection.DataProjectionInfo; import org.apache.asterix.runtime.projection.FunctionCallInformation; +import org.apache.commons.lang3.mutable.Mutable; import org.apache.hyracks.algebricks.core.algebra.base.ILogicalExpression; import org.apache.hyracks.algebricks.core.algebra.base.LogicalExpressionTag; import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable; @@ -72,6 +74,10 @@ class ExpectedSchemaBuilder { } public boolean setSchemaFromExpression(AbstractFunctionCallExpression expr, LogicalVariable producedVar) { + return buildExpectedSchemaNodes(expr, producedVar); + } + + public boolean setSchemaFromCalculatedExpression(AbstractFunctionCallExpression expr, LogicalVariable producedVar) { //Parent always nested AbstractComplexExpectedSchemaNode parent = (AbstractComplexExpectedSchemaNode) buildNestedNode(expr); if (parent != null) { @@ -111,6 +117,67 @@ class ExpectedSchemaBuilder { return 
!varToNode.isEmpty(); } + private boolean buildExpectedSchemaNodes(ILogicalExpression expr, LogicalVariable producedVar) { + return buildNestedNodes(expr, producedVar); + } + + private boolean buildNestedNodes(ILogicalExpression expr, LogicalVariable producedVar) { + //The current node expression + boolean changed = false; + if (expr.getExpressionTag() != LogicalExpressionTag.FUNCTION_CALL) { + return false; + } + AbstractFunctionCallExpression myExpr = (AbstractFunctionCallExpression) expr; + if (!SUPPORTED_FUNCTIONS.contains(myExpr.getFunctionIdentifier()) || noArgsOrFirstArgIsConstant(myExpr)) { + // Check if the function consists of the Supported Functions + for (Mutable<ILogicalExpression> arg : myExpr.getArguments()) { + changed |= buildNestedNodes(arg.getValue(), producedVar); + } + return changed; + } + // if the child is not a function expression, then just one node. + if (BuiltinFunctions.ARRAY_STAR.equals(myExpr.getFunctionIdentifier()) + || BuiltinFunctions.SCAN_COLLECTION.equals(myExpr.getFunctionIdentifier())) { + // these supported function won't have second child + IExpectedSchemaNode expectedSchemaNode = buildNestedNode(expr); + if (expectedSchemaNode != null) { + changed |= setSchemaFromCalculatedExpression((AbstractFunctionCallExpression) expr, producedVar); + } + } else { + ILogicalExpression childExpr = myExpr.getArguments().get(1).getValue(); + if (childExpr.getExpressionTag() != LogicalExpressionTag.FUNCTION_CALL) { + // must be a variable or constant + IExpectedSchemaNode expectedSchemaNode = buildNestedNode(expr); + if (expectedSchemaNode != null) { + changed |= setSchemaFromCalculatedExpression((AbstractFunctionCallExpression) expr, producedVar); + } + } else { + // as the childExpr is a function. + // if the function had been evaluated at compile time, it would have been + // evaluated at this stage of compilation. 
+ // eg: field-access(t.r.p, substring("name",2,4)) + // this will be evaluated to field-access(t.r.p, "me") at compile time itself. + // since the execution reached this branch, this means the childExpr + // need to be evaluated at runtime, hence the childExpr should also be checked + // for possible pushdown. + // eg: field-access(t.r.p, substring(x.y.age_field, 0, 4)) + ILogicalExpression parentExpr = myExpr.getArguments().get(0).getValue(); + IExpectedSchemaNode parentExpectedNode = buildNestedNode(parentExpr); + if (parentExpectedNode != null) { + changed |= + setSchemaFromCalculatedExpression((AbstractFunctionCallExpression) parentExpr, producedVar); + } + changed |= buildNestedNodes(childExpr, producedVar); + } + } + return changed; + } + + private boolean noArgsOrFirstArgIsConstant(AbstractFunctionCallExpression myExpr) { + List<Mutable<ILogicalExpression>> args = myExpr.getArguments(); + return args.isEmpty() || args.get(0).getValue().getExpressionTag() == LogicalExpressionTag.CONSTANT; + } + private IExpectedSchemaNode buildNestedNode(ILogicalExpression expr) { //The current node expression AbstractFunctionCallExpression myExpr = (AbstractFunctionCallExpression) expr; diff --git a/asterixdb/asterix-app/data/hdfs/parquet/friends.json b/asterixdb/asterix-app/data/hdfs/parquet/friends.json new file mode 100644 index 0000000000..d708ad9a53 --- /dev/null +++ b/asterixdb/asterix-app/data/hdfs/parquet/friends.json @@ -0,0 +1 @@ +{ "id": "1", "name": "Monica", "x": { "y": { "age_field": "age" } }, "t": { "r": { "p": { "age": "26" } } } } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java index 316d261e14..7963132494 100644 --- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java +++ 
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java @@ -272,6 +272,7 @@ public class ExternalDatasetTestUtils { loadData(generatedDataBasePath, "", "heterogeneous_1.parquet", definition, definitionSegment, false, false); loadData(generatedDataBasePath, "", "heterogeneous_2.parquet", definition, definitionSegment, false, false); loadData(generatedDataBasePath, "", "parquetTypes.parquet", definition, definitionSegment, false, false); + loadData(generatedDataBasePath, "", "friends.parquet", definition, definitionSegment, false, false); } private static void loadData(String fileBasePath, String filePathSegment, String filename, String definition, diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp new file mode 100644 index 0000000000..a601a8db19 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* +* Description : Field access pushdown +* Expected Res : Success +* Date : June 22nd 2020 +*/ + +DROP DATAVERSE test IF EXISTS; +CREATE DATAVERSE test; + +USE test; + + +CREATE TYPE ParquetType as { +}; + +CREATE EXTERNAL DATASET ParquetDataset(ParquetType) USING %adapter% +( + %template%, + ("container"="playground"), + ("definition"="parquet-data/reviews"), + ("include"="*friends.parquet"), + ("format" = "parquet") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp new file mode 100644 index 0000000000..e72d4121da --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +USE test; + +SET `compiler.external.field.pushdown` "true"; + +EXPLAIN +SELECT t.r.g, `field-access-by-name`(t.r.p, x.y.age_field) +FROM ParquetDataset; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp new file mode 100644 index 0000000000..d15ba8d766 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +USE test; + +SET `compiler.external.field.pushdown` "true"; + +SELECT t.r.g, `field-access-by-name`(t.r.p, x.y.age_field) +FROM ParquetDataset; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan new file mode 100644 index 0000000000..4806a282ee --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan @@ -0,0 +1 @@ +"distribute result [$$24] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n-- DISTRIBUTE_RESULT |PARTITIONED|\n exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- ONE_TO_ONE_EXCHANGE |PARTITIONED|\n project ([$$24]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- STREAM_PROJECT |PARTITIONED|\n assign [$$24] <- [{\"g\": $$25.getField(\"g\"), \"$1\": $$25.getField(\"p\").getField(\"$$ParquetDataset.getField(\"x\").ge [...] 
\ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm new file mode 100644 index 0000000000..224633561b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm @@ -0,0 +1 @@ +{ "$1": "26" } \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index 724298479e..723c1186d3 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ -101,6 +101,12 @@ <output-dir compare="Text">common/parquet/field-access-pushdown</output-dir> </compilation-unit> </test-case> + <test-case FilePath="external-dataset"> + <compilation-unit name="common/parquet/ASTERIXDB-3540"> + <placeholder name="adapter" value="S3" /> + <output-dir compare="Clean-JSON">common/parquet/ASTERIXDB-3540</output-dir> + </compilation-unit> + </test-case> <test-case FilePath="external-dataset"> <compilation-unit name="common/parquet/array-access-pushdown"> <placeholder name="adapter" value="S3" /> diff --git a/hyracks-fullstack/NOTICE b/hyracks-fullstack/NOTICE index e9bb9a4535..722db88282 100644 --- a/hyracks-fullstack/NOTICE +++ b/hyracks-fullstack/NOTICE @@ -1,5 +1,5 @@ Apache Hyracks and Algebricks -Copyright 2015-2024 The Apache Software Foundation +Copyright 2015-2025 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/).
