Copilot commented on code in PR #6438: URL: https://github.com/apache/hive/pull/6438#discussion_r3205830044
########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task responsible for periodic auto-deletion of table and partition column + * statistics based on a configured retention interval. 
+ * + * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task scans + * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp + * is older than {@code metastore.statistics.retention.period}, and deletes them. Review Comment: The class-level Javadoc references config keys `metastore.statistics.*`, but the actual ConfVars introduced/used by this task are `metastore.column.statistics.*` (e.g., `metastore.column.statistics.auto.deletion`, `metastore.column.statistics.retention.period`). Please update the Javadoc to match the real configuration keys to avoid misleading operators. ########## standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java: ########## @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.metastore; + +import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_CATALOG_NAME; +import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_DATABASE_NAME; +import static org.junit.Assert.assertTrue; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.client.builder.DatabaseBuilder; +import org.apache.hadoop.hive.metastore.client.builder.TableBuilder; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.apache.hadoop.hive.metastore.security.HadoopThriftAuthBridge; +import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil; +import org.apache.thrift.TException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Unit tests for {@link StatisticsManagementTask}, verifying that expired table-level column + * statistics are deleted on schedule and that tables marked with the exclude property are left + * untouched. 
+ */ +@Category(MetastoreUnitTest.class) +public class TestStatisticsManagement { + + private IMetaStoreClient client; + private Configuration conf; + + @Before + public void setUp() throws Exception { + conf = MetastoreConf.newMetastoreConf(); + MetastoreConf.setVar(conf, ConfVars.METASTORE_METADATA_TRANSFORMER_CLASS, " "); + MetaStoreTestUtils.setConfForStandloneMode(conf); + conf.setBoolean(ConfVars.MULTITHREADED.getVarname(), false); + conf.setBoolean(ConfVars.HIVE_IN_TEST.getVarname(), true); + + // Enable stats auto deletion with a short retention so the threshold check triggers easily. + MetastoreConf.setBoolVar(conf, ConfVars.COLUMN_STATISTICS_AUTO_DELETION, true); + MetastoreConf.setTimeVar(conf, ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, 1, TimeUnit.DAYS); + + MetaStoreTestUtils.startMetaStoreWithRetry(HadoopThriftAuthBridge.getBridge(), conf); + TestTxnDbUtil.setConfValues(conf); + TestTxnDbUtil.prepDb(conf); + + client = new HiveMetaStoreClient(conf); + } + + @After + public void tearDown() throws Exception { + if (client != null) { + // Drop any leftover databases, similar to TestPartitionManagement.java. 
+ List<String> dbs = client.getAllDatabases(DEFAULT_CATALOG_NAME); + for (String db : dbs) { + if (!db.equalsIgnoreCase(DEFAULT_DATABASE_NAME)) { + client.dropDatabase(DEFAULT_CATALOG_NAME, db, true, false, true); + } + } + } + try { + if (client != null) { + client.close(); + } + } finally { + client = null; + } + } + + @Test + public void testExpiredTableColStatsAreDeleted() throws Exception { + String dbName = "stats_db1"; + String tableName = "tbl1"; + createDbAndTable(dbName, tableName, false); + writeTableLevelColStats(dbName, tableName, "c1"); + assertHasTableColStats(dbName, tableName, "c1"); + makeAllTableColStatsOlderThanRetention(dbName, tableName); + + runStatisticsManagementTask(conf); + + assertNoTableColStats(dbName, tableName, "c1"); + } Review Comment: The production task deletes both table-level and partition-level column stats, but the current unit tests only cover table-level deletion and the exclude-table-property behavior. Please add a test that creates partition column stats, backdates `MPartitionColumnStatistics.lastAnalyzed`, runs the task, and asserts partition stats are (or are not) deleted as expected. ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task responsible for periodic auto-deletion of table and partition column + * statistics based on a configured retention interval. + * + * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task scans + * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp + * is older than {@code metastore.statistics.retention.period}, and deletes them. + * Individual tables may opt out by setting the table property + * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null value. 
+ */ +public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { + + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + /** + * Table property key that, when present on a table, excludes it from automatic statistics + * deletion regardless of the global retention setting. + */ + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = + "statistics.auto.deletion.exclude"; + + private static final Lock LOCK = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + this.conf = new Configuration(configuration); + super.setConf(configuration); Review Comment: `setConf` makes a defensive copy into `this.conf`, but then calls `super.setConf(configuration)` with the original instance. Since `ObjectStore.setConf` initializes internal state (PMF/PM) from the provided Configuration, this can lead to the task using one conf while the ObjectStore base uses another. Pass the same defensive copy to `super.setConf(...)` for consistency. ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task responsible for periodic auto-deletion of table and partition column + * statistics based on a configured retention interval. + * + * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task scans + * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp + * is older than {@code metastore.statistics.retention.period}, and deletes them. + * Individual tables may opt out by setting the table property + * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null value. 
+ */ +public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { + + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + /** + * Table property key that, when present on a table, excludes it from automatic statistics + * deletion regardless of the global retention setting. + */ + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = + "statistics.auto.deletion.exclude"; + + private static final Lock LOCK = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + this.conf = new Configuration(configuration); + super.setConf(configuration); + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void run() { + LOG.debug("Auto statistics deletion started. 
Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!LOCK.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + PersistenceManager pm = getPersistenceManager(); + boolean committed = false; + openTransaction(); + try { + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold); + deleteExpiredPartitionColStats(pm, msc, lastAnalyzedThreshold); + } + committed = commitTransaction(); + } finally { + if (!committed) { + rollbackTransaction(); + } + } + } catch (Exception e) { + LOG.error("Error during statistics auto deletion", e); + } finally { + LOCK.unlock(); + } + } + + /** + * Deletes expired table-level column statistics from {@code TAB_COL_STATS}. + * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. 
+ * + * @param pm the JDO persistence manager to use for the query + * @param msc the metastore client used to issue delete requests + * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired + * @throws Exception if the JDO query or the delete request fails + */ + private void deleteExpiredTableColStats(PersistenceManager pm, IMetaStoreClient msc, + long lastAnalyzedThreshold) throws Exception { + Query tblQuery = null; + try { + tblQuery = pm.newQuery(MTableColumnStatistics.class); + tblQuery.setFilter("lastAnalyzed < threshold"); + tblQuery.declareParameters("long threshold"); + // partitionName does not exist on MTableColumnStatistics; omitted here + tblQuery.setResult( + "table.database.name, " + + "table.tableName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); + @SuppressWarnings("unchecked") + List<Object[]> tblRows = (List<Object[]>) tblQuery.execute(lastAnalyzedThreshold); + for (Object[] row : tblRows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String excludeVal = (String) row[2]; + if (excludeVal != null) { + LOG.info("Skipping auto deletion of table stats for {}.{} due to exclude property.", + dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + request.setTableLevel(true); Review Comment: `deleteExpiredTableColStats` is filtering expired *rows* in `TAB_COL_STATS`, but the delete request is issued without `col_names`, which causes HMS to delete stats for *all columns* in the table (including non-expired ones). Also, because the query returns one row per column, this will invoke deletion repeatedly for the same table and can emit duplicate listener events (the rawstore delete returns true even when 0 rows are affected). 
Project the column name, group by (catalog, db, table), and pass only the expired `col_names` (and set `cat_name` from the query result) so only expired stats are removed once per table. ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task responsible for periodic auto-deletion of table and partition column + * statistics based on a configured retention interval. 
+ * + * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task scans + * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp + * is older than {@code metastore.statistics.retention.period}, and deletes them. + * Individual tables may opt out by setting the table property + * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null value. + */ +public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { + + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + /** + * Table property key that, when present on a table, excludes it from automatic statistics + * deletion regardless of the global retention setting. + */ + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = + "statistics.auto.deletion.exclude"; + + private static final Lock LOCK = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + this.conf = new Configuration(configuration); + super.setConf(configuration); + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void run() { + LOG.debug("Auto statistics deletion started. 
Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!LOCK.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + PersistenceManager pm = getPersistenceManager(); + boolean committed = false; + openTransaction(); + try { + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold); + deleteExpiredPartitionColStats(pm, msc, lastAnalyzedThreshold); Review Comment: In `run()`, `openTransaction()/commitTransaction()` operate on `ObjectStore`'s internal `pm`, but the code uses a different `PersistenceManager pm = getPersistenceManager()` (which creates a new PM) for the JDO queries. This means the transaction does not apply to the queries, and the extra PM is never closed, leaking resources in a periodically running task. Use the ObjectStore-managed PM/transaction for the queries (or remove the ObjectStore transaction and properly close the PM you create). ########## standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java: ########## @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task responsible for periodic auto-deletion of table and partition column + * statistics based on a configured retention interval. + * + * <p>When {@code metastore.statistics.auto.deletion} is enabled, this task scans + * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp + * is older than {@code metastore.statistics.retention.period}, and deletes them. + * Individual tables may opt out by setting the table property + * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null value. 
+ */ +public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { + + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + /** + * Table property key that, when present on a table, excludes it from automatic statistics + * deletion regardless of the global retention setting. + */ + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = + "statistics.auto.deletion.exclude"; + + private static final Lock LOCK = new ReentrantLock(); + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + this.conf = new Configuration(configuration); + super.setConf(configuration); + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void run() { + LOG.debug("Auto statistics deletion started. 
Cleaning up table/partition column statistics over the retention period."); + long retentionMillis = + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (!LOCK.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + PersistenceManager pm = getPersistenceManager(); + boolean committed = false; + openTransaction(); + try { + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold); + deleteExpiredPartitionColStats(pm, msc, lastAnalyzedThreshold); + } + committed = commitTransaction(); + } finally { + if (!committed) { + rollbackTransaction(); + } + } + } catch (Exception e) { + LOG.error("Error during statistics auto deletion", e); + } finally { + LOCK.unlock(); + } + } + + /** + * Deletes expired table-level column statistics from {@code TAB_COL_STATS}. + * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. 
+ * + * @param pm the JDO persistence manager to use for the query + * @param msc the metastore client used to issue delete requests + * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired + * @throws Exception if the JDO query or the delete request fails + */ + private void deleteExpiredTableColStats(PersistenceManager pm, IMetaStoreClient msc, + long lastAnalyzedThreshold) throws Exception { + Query tblQuery = null; + try { + tblQuery = pm.newQuery(MTableColumnStatistics.class); + tblQuery.setFilter("lastAnalyzed < threshold"); + tblQuery.declareParameters("long threshold"); + // partitionName does not exist on MTableColumnStatistics; omitted here + tblQuery.setResult( + "table.database.name, " + + "table.tableName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); + @SuppressWarnings("unchecked") + List<Object[]> tblRows = (List<Object[]>) tblQuery.execute(lastAnalyzedThreshold); + for (Object[] row : tblRows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String excludeVal = (String) row[2]; + if (excludeVal != null) { + LOG.info("Skipping auto deletion of table stats for {}.{} due to exclude property.", + dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + request.setTableLevel(true); + msc.deleteColumnStatistics(request); + } + } finally { + if (tblQuery != null) { + tblQuery.closeAll(); + } + } + } + + /** + * Deletes expired partition-level column statistics from {@code PART_COL_STATS}. + * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. 
+ * + * @param pm the JDO persistence manager to use for the query + * @param msc the metastore client used to issue delete requests + * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired + * @throws Exception if the JDO query or the delete request fails + */ + private void deleteExpiredPartitionColStats(PersistenceManager pm, IMetaStoreClient msc, + long lastAnalyzedThreshold) throws Exception { + Query partQuery = null; + try { + partQuery = pm.newQuery(MPartitionColumnStatistics.class); + partQuery.setFilter("lastAnalyzed < threshold"); + partQuery.declareParameters("long threshold"); + // project via partition navigation to reach partitionName and the table exclude property + partQuery.setResult( + "partition.table.database.name, " + + "partition.table.tableName, " + + "partition.partitionName, " + + "partition.table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); + @SuppressWarnings("unchecked") + List<Object[]> partRows = (List<Object[]>) partQuery.execute(lastAnalyzedThreshold); + for (Object[] row : partRows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; + String excludeVal = (String) row[3]; + if (excludeVal != null) { + LOG.info("Skipping auto deletion of partition stats for {}.{} due to exclude property.", + dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + request.setTableLevel(false); + request.addToPart_names(partName); + msc.deleteColumnStatistics(request); + } Review Comment: `deleteExpiredPartitionColStats` issues partition-level deletions without specifying `col_names`, which deletes stats for all columns in the partition (including non-expired ones) whenever any single column stat row is expired. 
Since the query returns one row per column, this can also trigger repeated deletes and duplicate listener events for the same (db,tbl,part). Include the column name in the projection and delete only the expired columns, grouped/deduped by (catalog, db, table, partition). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org For additional commands, e-mail: gitbox-help@hive.apache.org
