kasakrisz commented on code in PR #5721: URL: https://github.com/apache/hive/pull/5721#discussion_r2032648066
########## ql/src/java/org/apache/hadoop/hive/ql/ddl/table/misc/columnstats/AlterTableDropColumnStatisticsAnalyzer.java: ########## @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.ddl.table.misc.columnstats; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.common.TableName; +import org.apache.hadoop.hive.ql.QueryState; +import org.apache.hadoop.hive.ql.ddl.DDLSemanticAnalyzerFactory.DDLType; +import org.apache.hadoop.hive.ql.ddl.DDLUtils; +import org.apache.hadoop.hive.ql.ddl.table.AbstractAlterTableAnalyzer; +import org.apache.hadoop.hive.ql.ddl.table.AlterTableType; +import org.apache.hadoop.hive.ql.exec.ColumnStatsDropTask; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ColumnStatsDropWork; + +/** + * Analyzer for drop column statistics commands. + */ +@DDLType(types = {HiveParser.TOK_ALTERTABLE_DROPCOLSTATS, HiveParser.TOK_ALTERPARTITION_DROPCOLSTATS}) +public class AlterTableDropColumnStatisticsAnalyzer extends AbstractAlterTableAnalyzer { + public AlterTableDropColumnStatisticsAnalyzer(QueryState queryState) throws SemanticException { + super(queryState); + } + + @Override + protected void analyzeCommand(TableName tableName, Map<String, String> partitionSpec, ASTNode command) + throws SemanticException { + Table table = getTable(tableName); + + if (DDLUtils.isIcebergTable(table)) { Review Comment: Please don't use `isIcebergTable` because in the compiler it is too specific. I don't understand why is this method exists in the first place. It is an architectural mistake. IIUC we provide the drop stats function for storage types that store the column stats in HMS. So how about excluding the ones which can store them by themselves? ``` if (table.isNonNative() && table.getStorageHandler().canSetColStatistics(table)) ``` There is also a configuration which controls where stats are stored in case of Iceberg: https://github.com/apache/hive/blob/0e10ff30292265f00a7e7fb9c93f310f4349ac56/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2238-L2239 What should we do when it is set to `metasore`? Maybe we should allow deleting? WYT? ########## ql/src/java/org/apache/hadoop/hive/ql/exec/ColumnStatsDropTask.java: ########## @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.plan.ColumnStatsDropWork; +import org.apache.hadoop.hive.ql.plan.api.StageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * ColumnStatsDropTask implementation. Examples: + * ALTER TABLE src_stat DROP STATISTICS for columns [comma separated list of columns]; + * ALTER TABLE src_stat_part PARTITION(partitionId=100) DROP STATISTICS for columns [comma separated list of columns]; + **/ + +public class ColumnStatsDropTask extends Task<ColumnStatsDropWork> { + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(ColumnStatsDropTask.class); + + @Override + public int execute() { + try { + getHive() + .deleteColumnStatistics(work.getDbName(), work.getTableName(), work.getPartName(), work.getColNames()); + return 0; + } catch (Exception e) { + setException(e); + LOG.info("Failed to drop column stats in metastore", e); + return ErrorMsg.getErrorMsg(e.getMessage()).getErrorCode(); Review Comment: What is the error code in this case? What type of exceptions do we expect? Should we create a unique error? ########## ql/src/test/queries/clientpositive/drop_stats_for_columns.q: ########## @@ -0,0 +1,36 @@ +create table t1(i int, j int, k int); + +alter table t1 update statistics set('numRows'='10000', 'rawDataSize'='18000'); +alter table t1 update statistics for column i set('numDVs'='2500','numNulls'='50','highValue'='1000','lowValue'='0'); Review Comment: How does this update works? Especially if some properties are not specified? Do they removed? ``` alter table t1 update statistics for column i set(); ``` ########## ql/src/test/results/clientpositive/llap/drop_histogram_stats_for_columns.q.out: ########## @@ -0,0 +1,307 @@ +PREHOOK: query: CREATE TABLE test_stats (a string, b int, c double) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test_stats +POSTHOOK: query: CREATE TABLE test_stats (a string, b int, c double) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_stats +PREHOOK: query: insert into test_stats (a, b, c) values ("a", 2, 1.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("a", 2, 1.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("b", 2, 2.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("b", 2, 2.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("c", 2, 2.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("c", 2, 2.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("d", 2, 3.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("d", 2, 3.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("e", 2, 3.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("e", 2, 3.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("f", 2, 4.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("f", 2, 4.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("g", 2, 5.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("g", 2, 5.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("h", 2, 6.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("h", 2, 6.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: insert into test_stats (a, b, c) values ("i", 3, 6.1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test_stats +POSTHOOK: query: insert into test_stats (a, b, c) values ("i", 3, 6.1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test_stats +POSTHOOK: Lineage: test_stats.a SCRIPT [] +POSTHOOK: Lineage: test_stats.b SCRIPT [] +POSTHOOK: Lineage: test_stats.c SCRIPT [] +PREHOOK: query: describe formatted test_stats +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test_stats +POSTHOOK: query: describe formatted test_stats +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test_stats +# col_name data_type comment +a string +b int +c double + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}} + bucketing_version 2 + numFiles 9 + numRows 9 + rawDataSize 873 + totalSize #Masked# +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: describe formatted test_stats a +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test_stats +POSTHOOK: query: describe formatted test_stats a +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test_stats +col_name a +data_type string +min +max +num_nulls 0 +distinct_count 9 +avg_col_len 1.0 +max_col_len 1 +num_trues +num_falses +bit_vector HL +histogram +comment from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}} +PREHOOK: query: describe formatted test_stats b +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test_stats +POSTHOOK: query: describe formatted test_stats b +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test_stats +col_name b +data_type int +min 2 +max 3 +num_nulls 0 +distinct_count 2 +avg_col_len +max_col_len +num_trues +num_falses +bit_vector HL +histogram Q1: 2, Q2: 2, Q3: 2 +comment from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}} +PREHOOK: query: describe formatted test_stats c +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test_stats +POSTHOOK: query: describe formatted test_stats c +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test_stats +col_name c +data_type double +min 1.1 +max 6.1 +num_nulls 0 +distinct_count 6 +avg_col_len +max_col_len +num_trues +num_falses +bit_vector HL +histogram Q1: 2.1, Q2: 3.1, Q3: 5.1 +comment from deserializer +COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}} +PREHOOK: query: alter table test_stats drop statistics for columns a, c +PREHOOK: type: ALTERTABLE_DROP_COL_STATS +PREHOOK: Input: default@test_stats +PREHOOK: Output: default@test_stats +POSTHOOK: query: alter table test_stats drop statistics for columns a, c +POSTHOOK: type: ALTERTABLE_DROP_COL_STATS +POSTHOOK: Input: default@test_stats +POSTHOOK: Output: default@test_stats +PREHOOK: query: describe formatted test_stats +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@test_stats +POSTHOOK: query: describe formatted test_stats +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@test_stats +# col_name data_type comment +a string +b int +c double + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}} Review Comment: Why are column stats accurate after drop? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org