This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 45d6815821a29b83c7a3daa3d380a40e0e4f3836 Author: Csaba Ringhofer <csringho...@cloudera.com> AuthorDate: Sun Sep 24 14:21:01 2023 +0200 IMPALA-12462: Update only changed partitions after COMPUTE STATS This is mainly a revert of https://gerrit.cloudera.org/#/c/640/ but some parts had to be updated due to changes in Impala. See IMPALA-2201 for details about why this optimization was removed. The patch can massively speed up COMPUTE STATS statement when the majority of partitions has no changes. COMPUTE STATS tpcds_parquet.store_sales; before: 12s after: 1s Besides the DDL speed up the number of HMS events generated is also reduced. Testing: - added test to verify COMPUTE STATS output - correctness of cases when something is modified should be covered by existing tests - core tests passed Change-Id: If2703e0790d5c25db98ed26f26f6d96281c366a3 Reviewed-on: http://gerrit.cloudera.org:8080/20505 Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Reviewed-by: Wenzhe Zhou <wz...@cloudera.com> --- .../apache/impala/service/CatalogOpExecutor.java | 31 +++++++++++++++++----- .../QueryTest/compute-stats-incremental.test | 8 ++++++ .../queries/QueryTest/compute-stats.test | 8 ++++++ 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java index 256666d1b..d263222db 100644 --- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java +++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java @@ -1881,15 +1881,34 @@ public class CatalogOpExecutor { partitionStats.stats.setNum_rows(0L); } - // Unconditionally update the partition stats and row count, even if the partition - // already has identical ones. This behavior results in possibly redundant work, - // but it is predictable and easy to reason about because it does not depend on the - // existing state of the metadata. See IMPALA-2201. + // Update the partition in HMS only if something has changed. + // Note that previously all partitions were updated to avoid an HMS bug. + // See IMPALA-2201. + boolean updatedPartition = false; + TPartitionStats existingPartStats = partition.getPartitionStats(); + // Update the partition stats if: either there are no existing stats for this + // partition, or they're different. + if (existingPartStats == null || !existingPartStats.equals(partitionStats)) { + updatedPartition = true; + } + long numRows = partitionStats.stats.num_rows; + String existingRowCount = + partition.getParameters().get(StatsSetupConst.ROW_COUNT); + if (existingRowCount == null || + !existingRowCount.equals(String.valueOf(numRows))) { + // The existing row count value wasn't set or has changed. + updatedPartition = true; + } + if (LOG.isTraceEnabled()) { - LOG.trace(String.format("Updating stats for partition %s: numRows=%d", - partition.getValuesAsString(), numRows)); + LOG.trace("{} stats for partition {}: numRows={}", + updatedPartition ? "Updating" : "Skip updating", + partition.getValuesAsString(), numRows); } + + if (!updatedPartition) continue; + HdfsPartition.Builder partBuilder = new HdfsPartition.Builder(partition); PartitionStatsUtil.partStatsToPartition(partitionStats, partBuilder); partBuilder.setRowCountParam(numRows); diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test index 2eb659fc7..6a8b90740 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test +++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test @@ -207,6 +207,14 @@ compute incremental stats alltypes_incremental partition(year=2010, month=2); STRING ==== ---- QUERY +# Test that no partitions are updated if there are no changes (IMPALA-12462). +compute incremental stats alltypes_incremental partition(year=2010, month=2); +---- RESULTS +'Updated 0 partition(s) and 11 column(s).' +---- TYPES +STRING +==== +---- QUERY show table stats alltypes_incremental; ---- RESULTS '2009','1',310,1,'24.56KB','NOT CACHED','NOT CACHED','TEXT','true',regex:.*,'$ERASURECODE_POLICY' diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test index e9623425c..22e58bc14 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test +++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test @@ -13,6 +13,14 @@ compute stats alltypes STRING ==== ---- QUERY +# Test that no partitions are updated if there are no changes (IMPALA-12462). +compute stats alltypes +---- RESULTS +'Updated 0 partition(s) and 11 column(s).' +---- TYPES +STRING +==== +---- QUERY show table stats alltypes ---- LABELS YEAR, MONTH, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION, EC POLICY