(hive) branch master updated: HIVE-29422: Statistics - scaling numNulls/numTrues/numFalses when scaling rows (#6299)

okumin Wed, 25 Feb 2026 21:49:05 -0800

This is an automated email from the ASF dual-hosted git repository.

okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new 7aef5c9d8d9 HIVE-29422: Statistics - scaling numNulls/numTrues/numFalses when scaling rows (#6299)
7aef5c9d8d9 is described below

commit 7aef5c9d8d9345d5b8970c58fd06451957301273
Author: konstantinb <[email protected]>
AuthorDate: Wed Feb 25 21:48:48 2026 -0800

    HIVE-29422: Statistics - scaling numNulls/numTrues/numFalses when scaling rows (#6299)
    
    * HIVE-29422: Statistics - scaling numNulls/numTrues/numFalses when scaling rows
    
    * HIVE-29422: trigger a rebuild
    
    * HIVE-29422: trigger a rebuild
---
 .../org/apache/hadoop/hive/ql/plan/Statistics.java |  23 +-
 .../apache/hadoop/hive/ql/plan/TestStatistics.java | 156 +++++++++
 .../queries/clientpositive/runtime_stats_scaling.q |  45 +++
 .../llap/runtime_stats_scaling.q.out               | 369 +++++++++++++++++++++
 4 files changed, 592 insertions(+), 1 deletion(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
index 2e36c85e091..78ca89d9fd0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
@@ -345,9 +345,30 @@ public Statistics scaleToRowCount(long newRowCount, boolean downScaleOnly) {
     if (downScaleOnly && newRowCount >= numRows) {
       return ret;
     }
-    // FIXME: using real scaling by new/old ration might yield better results?
     ret.numRows = newRowCount;
     ret.dataSize = StatsUtils.safeMult(getAvgRowSize(), newRowCount);
+
+    // Adjust column stats to prevent invalid values after scaling: count-based
+    // stats are set to unknown (-1), zero values preserved. Distribution data cleared.
+    if (ret.columnStats != null) {
+      for (ColStatistics cs : ret.columnStats.values()) {
+        if (cs.getCountDistint() > newRowCount) {
+          cs.setCountDistint(newRowCount);
+        }
+        if (cs.getNumNulls() > 0) {
+          cs.setNumNulls(-1);
+        }
+        if (cs.getNumTrues() > 0) {
+          cs.setNumTrues(-1);
+        }
+        if (cs.getNumFalses() > 0) {
+          cs.setNumFalses(-1);
+        }
+        cs.setBitVectors(null);
+        cs.setHistogram(null);
+      }
+    }
+
     return ret;
   }
 
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/plan/TestStatistics.java b/ql/src/test/org/apache/hadoop/hive/ql/plan/TestStatistics.java
new file mode 100644
index 00000000000..ddbd964311d
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/plan/TestStatistics.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.plan;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+public class TestStatistics {
+
+  @Test
+  public void testScaleToRowCountPreventsNegativeNonNullCount() {
+    Statistics stats = new Statistics(10, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("str_col", "string");
+    colStats.setNumNulls(9);
+    colStats.setCountDistint(2);
+    colStats.setAvgColLen(10.0);
+    stats.setColumnStats(Arrays.asList(colStats));
+
+    Statistics scaled = stats.scaleToRowCount(1, false);
+
+    assertEquals(1, scaled.getNumRows());
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("str_col");
+    assertEquals(-1, scaledCol.getNumNulls());
+  }
+
+  @Test
+  public void testScaleToRowCountCapsCountDistinct() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("col1", "int");
+    colStats.setCountDistint(100);
+    colStats.setNumNulls(0);
+    stats.setColumnStats(Arrays.asList(colStats));
+
+    Statistics scaled = stats.scaleToRowCount(10, false);
+
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("col1");
+    assertEquals(10, scaledCol.getCountDistint());
+  }
+
+  @Test
+  public void testScaleToRowCountSetsNumNullsToUnknown() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("col1", "string");
+    colStats.setNumNulls(50);
+    stats.setColumnStats(Arrays.asList(colStats));
+
+    Statistics scaled = stats.scaleToRowCount(10, false);
+
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("col1");
+    assertEquals(-1, scaledCol.getNumNulls());
+  }
+
+  @Test
+  public void testScaleToRowCountSetsBooleanStatsToUnknown() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("bool_col", "boolean");
+    colStats.setNumTrues(30);
+    colStats.setNumFalses(70);
+    colStats.setNumNulls(0);
+    stats.setColumnStats(Arrays.asList(colStats));
+
+    Statistics scaled = stats.scaleToRowCount(10, false);
+
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("bool_col");
+    assertEquals(-1, scaledCol.getNumTrues());
+    assertEquals(-1, scaledCol.getNumFalses());
+  }
+
+  @Test
+  public void testScaleToRowCountPreservesZeroBooleanStats() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("bool_col", "boolean");
+    colStats.setNumTrues(0);
+    colStats.setNumFalses(100);
+    colStats.setNumNulls(0);
+    stats.setColumnStats(Arrays.asList(colStats));
+
+    Statistics scaled = stats.scaleToRowCount(10, false);
+
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("bool_col");
+    assertEquals(0, scaledCol.getNumTrues());
+    assertEquals(-1, scaledCol.getNumFalses());
+  }
+
+  @Test
+  public void testScaleToRowCountClearsDistributionData() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+    ColStatistics colStats = new ColStatistics("col1", "int");
+    colStats.setNumNulls(0);
+    colStats.setBitVectors(new byte[]{1, 2, 3});
+    colStats.setHistogram(new byte[]{4, 5, 6});
+    stats.setColumnStats(Arrays.asList(colStats));
+
+    Statistics scaled = stats.scaleToRowCount(10, false);
+
+    ColStatistics scaledCol = scaled.getColumnStatisticsFromColName("col1");
+    assertNull(scaledCol.getBitVectors());
+    assertNull(scaledCol.getHistogram());
+  }
+
+  @Test
+  public void testScaleToRowCountMultipleColumns() {
+    Statistics stats = new Statistics(100, 1000, 0, 0);
+
+    ColStatistics col1 = new ColStatistics("int_col", "int");
+    col1.setNumNulls(20);
+    col1.setCountDistint(80);
+
+    ColStatistics col2 = new ColStatistics("str_col", "string");
+    col2.setNumNulls(0);
+    col2.setCountDistint(50);
+
+    ColStatistics col3 = new ColStatistics("bool_col", "boolean");
+    col3.setNumNulls(10);
+    col3.setNumTrues(40);
+    col3.setNumFalses(50);
+
+    stats.setColumnStats(Arrays.asList(col1, col2, col3));
+
+    Statistics scaled = stats.scaleToRowCount(5, false);
+
+    assertEquals(5, scaled.getNumRows());
+
+    ColStatistics scaledCol1 = scaled.getColumnStatisticsFromColName("int_col");
+    assertEquals(-1, scaledCol1.getNumNulls());
+    assertEquals(5, scaledCol1.getCountDistint());
+
+    ColStatistics scaledCol2 = scaled.getColumnStatisticsFromColName("str_col");
+    assertEquals(0, scaledCol2.getNumNulls());
+    assertEquals(5, scaledCol2.getCountDistint());
+
+    ColStatistics scaledCol3 = scaled.getColumnStatisticsFromColName("bool_col");
+    assertEquals(-1, scaledCol3.getNumNulls());
+    assertEquals(-1, scaledCol3.getNumTrues());
+    assertEquals(-1, scaledCol3.getNumFalses());
+  }
+}
diff --git a/ql/src/test/queries/clientpositive/runtime_stats_scaling.q b/ql/src/test/queries/clientpositive/runtime_stats_scaling.q
new file mode 100644
index 00000000000..378fc576597
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/runtime_stats_scaling.q
@@ -0,0 +1,45 @@
+-- Test runtime statistics scaling: column stats adjustment when row count changes.
+-- When runtime row count is smaller than compile-time estimate, count-based stats
+-- (numNulls, numTrues, numFalses) must be adjusted to prevent invalid values.
+
+set hive.fetch.task.conversion=none;
+
+create table t_runtime_scaling (
+  id int,
+  str_col string,
+  bool_col boolean
+);
+
+-- 10 rows: skewed id values create selectivity mismatch
+-- str_col: 9 rows NULL, 1 non-null (tests numNulls for string)
+-- bool_col: 2 true, 1 false, 7 NULL (tests numNulls for boolean)
+insert into t_runtime_scaling values
+  (1, NULL, NULL), (2, NULL, NULL), (3, NULL, NULL), (4, NULL, NULL),
+  (5, NULL, NULL), (6, NULL, true), (7, NULL, true), (8, NULL, NULL),
+  (9, NULL, NULL), (100, 'only_non_null', false);
+
+analyze table t_runtime_scaling compute statistics;
+analyze table t_runtime_scaling compute statistics for columns;
+
+-- Compile-time: estimates ~50% selectivity (5 rows). Runtime: 1 row passes.
+
+-- Test 1: numNulls scaling for string (str_col has 9 nulls, scaled to 1 row)
+explain
+select str_col from t_runtime_scaling where id > 50;
+
+explain reoptimization
+select str_col from t_runtime_scaling where id > 50;
+
+-- Test 2: numNulls scaling for boolean (bool_col has 7 nulls, scaled to 1 row)
+explain
+select bool_col from t_runtime_scaling where id > 50;
+
+explain reoptimization
+select bool_col from t_runtime_scaling where id > 50;
+
+-- Test 3: combined (both columns)
+explain
+select str_col, bool_col from t_runtime_scaling where id > 50;
+
+explain reoptimization
+select str_col, bool_col from t_runtime_scaling where id > 50;
diff --git a/ql/src/test/results/clientpositive/llap/runtime_stats_scaling.q.out b/ql/src/test/results/clientpositive/llap/runtime_stats_scaling.q.out
new file mode 100644
index 00000000000..bad17e6daea
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/runtime_stats_scaling.q.out
@@ -0,0 +1,369 @@
+PREHOOK: query: create table t_runtime_scaling (
+  id int,
+  str_col string,
+  bool_col boolean
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_runtime_scaling
+POSTHOOK: query: create table t_runtime_scaling (
+  id int,
+  str_col string,
+  bool_col boolean
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_runtime_scaling
+PREHOOK: query: insert into t_runtime_scaling values
+  (1, NULL, NULL), (2, NULL, NULL), (3, NULL, NULL), (4, NULL, NULL),
+  (5, NULL, NULL), (6, NULL, true), (7, NULL, true), (8, NULL, NULL),
+  (9, NULL, NULL), (100, 'only_non_null', false)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_runtime_scaling
+POSTHOOK: query: insert into t_runtime_scaling values
+  (1, NULL, NULL), (2, NULL, NULL), (3, NULL, NULL), (4, NULL, NULL),
+  (5, NULL, NULL), (6, NULL, true), (7, NULL, true), (8, NULL, NULL),
+  (9, NULL, NULL), (100, 'only_non_null', false)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_runtime_scaling
+POSTHOOK: Lineage: t_runtime_scaling.bool_col SCRIPT []
+POSTHOOK: Lineage: t_runtime_scaling.id SCRIPT []
+POSTHOOK: Lineage: t_runtime_scaling.str_col SCRIPT []
+PREHOOK: query: analyze table t_runtime_scaling compute statistics
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+PREHOOK: Output: default@t_runtime_scaling
+POSTHOOK: query: analyze table t_runtime_scaling compute statistics
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+POSTHOOK: Output: default@t_runtime_scaling
+PREHOOK: query: analyze table t_runtime_scaling compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@t_runtime_scaling
+PREHOOK: Output: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table t_runtime_scaling compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@t_runtime_scaling
+POSTHOOK: Output: default@t_runtime_scaling
+#### A masked pattern was here ####
+PREHOOK: query: explain
+select str_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select str_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t_runtime_scaling
+                  filterExpr: (id > 50) (type: boolean)
+                  Statistics: Num rows: 10 Data size: 210 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id > 50) (type: boolean)
+                    Statistics: Num rows: 5 Data size: 105 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str_col (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 5 Data size: 85 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 5 Data size: 85 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain reoptimization
+select str_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain reoptimization
+select str_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+PREHOOK: query: explain reoptimization
+select str_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain reoptimization
+select str_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t_runtime_scaling
+                  filterExpr: (id > 50) (type: boolean)
+                  Statistics: (RUNTIME) Num rows: 10 Data size: 210 Basic 
stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id > 50) (type: boolean)
+                    Statistics: (RUNTIME) Num rows: 1 Data size: 89 Basic 
stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str_col (type: string)
+                      outputColumnNames: _col0
+                      Statistics: (RUNTIME) Num rows: 1 Data size: 85 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 1 Data size: 85 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain
+select bool_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select bool_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t_runtime_scaling
+                  filterExpr: (id > 50) (type: boolean)
+                  Statistics: Num rows: 10 Data size: 56 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id > 50) (type: boolean)
+                    Statistics: Num rows: 5 Data size: 28 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: bool_col (type: boolean)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 5 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 5 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain reoptimization
+select bool_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain reoptimization
+select bool_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+PREHOOK: query: explain reoptimization
+select bool_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain reoptimization
+select bool_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t_runtime_scaling
+                  filterExpr: (id > 50) (type: boolean)
+                  Statistics: (RUNTIME) Num rows: 10 Data size: 50 Basic 
stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id > 50) (type: boolean)
+                    Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic 
stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: bool_col (type: boolean)
+                      outputColumnNames: _col0
+                      Statistics: (RUNTIME) Num rows: 1 Data size: 4 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain
+select str_col, bool_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select str_col, bool_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t_runtime_scaling
+                  filterExpr: (id > 50) (type: boolean)
+                  Statistics: Num rows: 10 Data size: 226 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id > 50) (type: boolean)
+                    Statistics: Num rows: 5 Data size: 113 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str_col (type: string), bool_col (type: 
boolean)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 5 Data size: 93 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 5 Data size: 93 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain reoptimization
+select str_col, bool_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain reoptimization
+select str_col, bool_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+PREHOOK: query: explain reoptimization
+select str_col, bool_col from t_runtime_scaling where id > 50
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+POSTHOOK: query: explain reoptimization
+select str_col, bool_col from t_runtime_scaling where id > 50
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_runtime_scaling
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t_runtime_scaling
+                  filterExpr: (id > 50) (type: boolean)
+                  Statistics: (RUNTIME) Num rows: 10 Data size: 220 Basic 
stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (id > 50) (type: boolean)
+                    Statistics: (RUNTIME) Num rows: 1 Data size: 93 Basic 
stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str_col (type: string), bool_col (type: 
boolean)
+                      outputColumnNames: _col0, _col1
+                      Statistics: (RUNTIME) Num rows: 1 Data size: 89 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 1 Data size: 89 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+

(hive) branch master updated: HIVE-29422: Statistics - scaling numNulls/numTrues/numFalses when scaling rows (#6299)

Reply via email to