This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 7003dc9f2b8 HIVE-29300: Wrong estimation for num rows in EXPLAIN with histogram statistics (#6189)
7003dc9f2b8 is described below
commit 7003dc9f2b8e758db49d68163b03488d085646cc
Author: Thomas Rebele <[email protected]>
AuthorDate: Mon Nov 17 17:22:16 2025 +0100
HIVE-29300: Wrong estimation for num rows in EXPLAIN with histogram statistics (#6189)
---
.../stats/annotation/StatsRulesProcFactory.java | 18 +-
.../test/queries/clientpositive/stats_histogram2.q | 28 +++
.../clientpositive/llap/stats_histogram.q.out | 6 +-
.../clientpositive/llap/stats_histogram2.q.out | 215 +++++++++++++++++++++
4 files changed, 258 insertions(+), 9 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index a91228fd60c..19f83f39147 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -1285,15 +1285,21 @@ private long evaluateComparatorWithHistogram(ColStatistics cs, long currNumRows,
     try {
       final float value = extractFloatFromLiteralValue(colTypeLowerCase, boundValue);
-      // kll ignores null values (i.e., kll.getN() + numNulls = currNumRows), we therefore need to use kll.getN()
-      // instead of currNumRows since the CDF is expressed as a fraction of kll.getN(), not currNumRows
+      double rawSelectivity;
       if (upperBound) {
-        return Math.round(kll.getN() * (closedBound ?
-            lessThanOrEqualSelectivity(kll, value) : lessThanSelectivity(kll, value)));
+        rawSelectivity = closedBound ?
+            lessThanOrEqualSelectivity(kll, value) : lessThanSelectivity(kll, value);
       } else {
-        return Math.round(kll.getN() * (closedBound ?
-            greaterThanOrEqualSelectivity(kll, value) : greaterThanSelectivity(kll, value)));
+        rawSelectivity = closedBound ?
+            greaterThanOrEqualSelectivity(kll, value) : greaterThanSelectivity(kll, value);
       }
+
+      // kll ignores null values, i.e., kll.getN() + numNulls = total number of rows,
+      // so the above rawSelectivity = count/kll.getN().
+      // What we need is the selectivity count/(total number of rows).
+      // With count = kll.getN() * rawSelectivity we get:
+      double totalSelectivity = (kll.getN() * rawSelectivity) / (kll.getN() + cs.getNumNulls());
+      return Math.round(currNumRows * totalSelectivity);
     } catch (RuntimeException e) {
       LOG.debug("Selectivity computation using histogram failed to parse the boundary value ({}), "
           + ", using the generic computation strategy", boundValue, e);
diff --git a/ql/src/test/queries/clientpositive/stats_histogram2.q b/ql/src/test/queries/clientpositive/stats_histogram2.q
new file mode 100644
index 00000000000..63907692136
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/stats_histogram2.q
@@ -0,0 +1,28 @@
+--! qt:dataset:src
+-- The histograms created by ANALYZE TABLE are not deterministic, so mask them:
+--! qt:replace:/(Q[0-9]:) *[0-9.]+/$1 #Masked#/
+-- Same for the row estimations influenced by the histograms.
+-- They are around 5 rows. If they are >= 20 something is really off:
+--! qt:replace:/Statistics: Num rows: [0-1]?[0-9] Data size: [0-9]+/Statistics: Num rows: #Masked# Data size: #Masked#/
+
+set hive.fetch.task.conversion=none;
+set metastore.stats.fetch.bitvector=true;
+set metastore.stats.fetch.kll=true;
+set hive.stats.fetch.column.stats=true;
+
+CREATE TABLE sh2a AS (SELECT cast(key as int) as k1, cast(key as int) as k2 FROM src);
+
+DESCRIBE FORMATTED sh2a k1;
+DESCRIBE FORMATTED sh2a k2;
+
+explain SELECT 1 FROM sh2a WHERE k1 < 10 AND k2 < 250;
+SELECT count(1) FROM sh2a WHERE k1 < 10 AND k2 < 250;
+
+set hive.stats.kll.enable=true;
+CREATE TABLE sh2b AS (SELECT cast(key as int) as k1, cast(key as int) as k2 FROM src);
+ANALYZE TABLE sh2b COMPUTE STATISTICS FOR COLUMNS;
+
+DESCRIBE FORMATTED sh2b k1;
+DESCRIBE FORMATTED sh2b k2;
+
+explain SELECT 1 FROM sh2b WHERE k1 < 10 AND k2 < 250;
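
As a rough illustration of the masking in the test above (assuming the qt:replace directive behaves like an ordinary regex substitution; the exact harness mechanics are not shown here), estimates below 20 rows are masked, while a badly wrong estimate survives and breaks the .q.out comparison:

    import java.util.regex.Pattern;

    // Illustrative only: mimics the second qt:replace pattern from stats_histogram2.q.
    public class MaskingSketch {
      public static void main(String[] args) {
        Pattern p = Pattern.compile("Statistics: Num rows: [0-1]?[0-9] Data size: [0-9]+");
        String replacement = "Statistics: Num rows: #Masked# Data size: #Masked#";

        // 0-19 rows: matched and masked.
        System.out.println(p.matcher("Statistics: Num rows: 5 Data size: 40").replaceAll(replacement));
        // >= 20 rows: not matched, so a badly wrong estimate stays visible in the output.
        System.out.println(p.matcher("Statistics: Num rows: 250 Data size: 2000").replaceAll(replacement));
      }
    }
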
diff --git a/ql/src/test/results/clientpositive/llap/stats_histogram.q.out b/ql/src/test/results/clientpositive/llap/stats_histogram.q.out
index 4c6b08004bb..5ff94404746 100644
--- a/ql/src/test/results/clientpositive/llap/stats_histogram.q.out
+++ b/ql/src/test/results/clientpositive/llap/stats_histogram.q.out
@@ -714,12 +714,12 @@ STAGE PLANS:
Statistics: Num rows: 15 Data size: 1732 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (((d < 3.0) or (d > 7.0)) and (e > 0)) (type: boolean)
- Statistics: Num rows: 10 Data size: 1156 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 924 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
- Statistics: Num rows: 10 Data size: 1156 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 8 Data size: 924 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
- minReductionHashAggr: 0.9
+ minReductionHashAggr: 0.875
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
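
(The minReductionHashAggr change in this hunk is consistent with the updated row estimate: the count() group-by collapses its input to a single row, and 1 - 1/10 = 0.9 while 1 - 1/8 = 0.875, assuming Hive derives the value as 1 - estimated output rows / estimated input rows.)
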
diff --git a/ql/src/test/results/clientpositive/llap/stats_histogram2.q.out b/ql/src/test/results/clientpositive/llap/stats_histogram2.q.out
new file mode 100644
index 00000000000..fd35003aede
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/stats_histogram2.q.out
@@ -0,0 +1,215 @@
+PREHOOK: query: CREATE TABLE sh2a AS (SELECT cast(key as int) as k1, cast(key as int) as k2 FROM src)
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sh2a
+POSTHOOK: query: CREATE TABLE sh2a AS (SELECT cast(key as int) as k1, cast(key as int) as k2 FROM src)
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sh2a
+POSTHOOK: Lineage: sh2a.k1 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sh2a.k2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: DESCRIBE FORMATTED sh2a k1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sh2a
+POSTHOOK: query: DESCRIBE FORMATTED sh2a k1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@sh2a
+col_name k1
+data_type int
+min 0
+max 498
+num_nulls 0
+distinct_count 303
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"k1\":\"true\",\"k2\":\"true\"}}
+PREHOOK: query: DESCRIBE FORMATTED sh2a k2
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sh2a
+POSTHOOK: query: DESCRIBE FORMATTED sh2a k2
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@sh2a
+col_name k2
+data_type int
+min 0
+max 498
+num_nulls 0
+distinct_count 303
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"k1\":\"true\",\"k2\":\"true\"}}
+PREHOOK: query: explain SELECT 1 FROM sh2a WHERE k1 < 10 AND k2 < 250
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sh2a
+#### A masked pattern was here ####
+POSTHOOK: query: explain SELECT 1 FROM sh2a WHERE k1 < 10 AND k2 < 250
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@sh2a
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: sh2a
+ filterExpr: ((k1 < 10) and (k2 < 250)) (type: boolean)
+ Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((k1 < 10) and (k2 < 250)) (type: boolean)
+ Statistics: Num rows: #Masked# Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: 1 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: #Masked# Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: #Masked# Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT count(1) FROM sh2a WHERE k1 < 10 AND k2 < 250
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sh2a
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM sh2a WHERE k1 < 10 AND k2 < 250
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@sh2a
+#### A masked pattern was here ####
+10
+PREHOOK: query: CREATE TABLE sh2b AS (SELECT cast(key as int) as k1, cast(key as int) as k2 FROM src)
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sh2b
+POSTHOOK: query: CREATE TABLE sh2b AS (SELECT cast(key as int) as k1, cast(key as int) as k2 FROM src)
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sh2b
+POSTHOOK: Lineage: sh2b.k1 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sh2b.k2 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: ANALYZE TABLE sh2b COMPUTE STATISTICS FOR COLUMNS
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@sh2b
+PREHOOK: Output: default@sh2b
+#### A masked pattern was here ####
+POSTHOOK: query: ANALYZE TABLE sh2b COMPUTE STATISTICS FOR COLUMNS
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@sh2b
+POSTHOOK: Output: default@sh2b
+#### A masked pattern was here ####
+PREHOOK: query: DESCRIBE FORMATTED sh2b k1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sh2b
+POSTHOOK: query: DESCRIBE FORMATTED sh2b k1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@sh2b
+col_name k1
+data_type int
+min 0
+max 498
+num_nulls 0
+distinct_count 303
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+histogram Q1: #Masked#, Q2: #Masked#, Q3: #Masked#
+comment from deserializer
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"k1\":\"true\",\"k2\":\"true\"}}
+PREHOOK: query: DESCRIBE FORMATTED sh2b k2
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sh2b
+POSTHOOK: query: DESCRIBE FORMATTED sh2b k2
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@sh2b
+col_name k2
+data_type int
+min 0
+max 498
+num_nulls 0
+distinct_count 303
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+histogram Q1: #Masked#, Q2: #Masked#, Q3: #Masked#
+comment from deserializer
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"k1\":\"true\",\"k2\":\"true\"}}
+PREHOOK: query: explain SELECT 1 FROM sh2b WHERE k1 < 10 AND k2 < 250
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sh2b
+#### A masked pattern was here ####
+POSTHOOK: query: explain SELECT 1 FROM sh2b WHERE k1 < 10 AND k2 < 250
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@sh2b
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: sh2b
+ filterExpr: ((k1 < 10) and (k2 < 250)) (type: boolean)
+ Statistics: Num rows: 500 Data size: 4000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((k1 < 10) and (k2 < 250)) (type: boolean)
+ Statistics: Num rows: #Masked# Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: 1 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: #Masked# Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: #Masked# Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+