This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new fa579c49218 HIVE-28196: Preserve column stats when applying UDF
upper/lower (Seonggon Namgung, reviewed by Denys Kuzmenko)
fa579c49218 is described below
commit fa579c492182395738708f3ccb610845a9f99063
Author: seonggon <[email protected]>
AuthorDate: Tue May 28 19:38:18 2024 +0900
HIVE-28196: Preserve column stats when applying UDF upper/lower (Seonggon
Namgung, reviewed by Denys Kuzmenko)
Closes #5263
---
.../hive/ql/udf/generic/GenericUDFLower.java | 18 +++-
.../hive/ql/udf/generic/GenericUDFUpper.java | 18 +++-
.../queries/clientpositive/stats_uppper_lower.q | 13 +++
.../llap/groupby_grouping_sets_pushdown1.q.out | 6 +-
.../llap/reduce_deduplicate_extended.q.out | 12 +--
.../clientpositive/llap/stats_uppper_lower.q.out | 108 +++++++++++++++++++++
.../results/clientpositive/llap/vector_udf1.q.out | 8 +-
.../llap/vectorized_string_funcs.q.out | 4 +-
.../test/results/clientpositive/nonmr_fetch.q.out | 2 +-
.../perf/tpcds30tb/tez/query24.q.out | 10 +-
10 files changed, 176 insertions(+), 23 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java
b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java
index 128df018eca..41143890742 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java
@@ -24,6 +24,9 @@ import
org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringLower;
import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
@@ -34,6 +37,9 @@ import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import java.util.List;
+import java.util.Optional;
+
/**
* UDFLower.
*
@@ -43,7 +49,7 @@ value = "_FUNC_(str) - Returns str with all characters
changed to lowercase",
extended = "Example:\n"
+ " > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + " 'facebook'")
@VectorizedExpressions({StringLower.class})
-public class GenericUDFLower extends GenericUDF {
+public class GenericUDFLower extends GenericUDF implements
StatEstimatorProvider {
private transient PrimitiveObjectInspector argumentOI;
private transient StringConverter stringConverter;
private transient PrimitiveCategory returnType = PrimitiveCategory.STRING;
@@ -108,4 +114,14 @@ public class GenericUDFLower extends GenericUDF {
return getStandardDisplayString("lower", children);
}
+ @Override
+ public StatEstimator getStatEstimator() { // Supplies the estimator used for the column statistics of a lower(...) expression.
+ return new StatEstimator() {
+ @Override
+ public Optional<ColStatistics> estimate(List<ColStatistics> argStats) { // NOTE(review): argStats presumably holds one entry per UDF argument - confirm against StatEstimator.
+ return Optional.of(argStats.get(0).clone()); // Always present: a clone of the first argument's statistics, passed through unmodified; no guard for an empty list.
+ }
+ };
+ }
+
}
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java
b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java
index 25a6e04ddeb..019cbe94a4b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java
@@ -24,6 +24,9 @@ import
org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUpper;
import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
@@ -34,6 +37,9 @@ import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import java.util.List;
+import java.util.Optional;
+
/**
* UDFUpper.
*
@@ -43,7 +49,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
extended = "Example:\n"
+ " > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + " 'FACEBOOK'")
@VectorizedExpressions({StringUpper.class})
-public class GenericUDFUpper extends GenericUDF {
+public class GenericUDFUpper extends GenericUDF implements
StatEstimatorProvider {
private transient PrimitiveObjectInspector argumentOI;
private transient StringConverter stringConverter;
private transient PrimitiveCategory returnType = PrimitiveCategory.STRING;
@@ -110,4 +116,14 @@ public class GenericUDFUpper extends GenericUDF {
return getStandardDisplayString("upper", children);
}
+ @Override
+ public StatEstimator getStatEstimator() { // Supplies the estimator used for the column statistics of an upper(...) expression.
+ return new StatEstimator() {
+ @Override
+ public Optional<ColStatistics> estimate(List<ColStatistics> argStats) { // NOTE(review): argStats presumably holds one entry per UDF argument - confirm against StatEstimator.
+ return Optional.of(argStats.get(0).clone()); // Always present: a clone of the first argument's statistics, passed through unmodified; no guard for an empty list.
+ }
+ };
+ }
+
}
diff --git a/ql/src/test/queries/clientpositive/stats_uppper_lower.q
b/ql/src/test/queries/clientpositive/stats_uppper_lower.q
new file mode 100644
index 00000000000..fa222c9a69b
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/stats_uppper_lower.q
@@ -0,0 +1,13 @@
+set hive.explain.user=false;
+set hive.stats.fetch.column.stats=true;
+set hive.stats.max.variable.length=10000;
+set hive.auto.convert.join=true;
+set hive.auto.convert.join.noconditionaltask.size=10000;
+
+create table t (key string, value string);
+
+insert into t values ('a', 'a'), ('b', 'b'), ('c', 'c'), ('D', 'D'), ('E',
'E');
+
+analyze table t compute statistics for columns;
+
+explain select a.key, lower(a.value) from t a join t b on a.key = upper(b.key);
diff --git
a/ql/src/test/results/clientpositive/llap/groupby_grouping_sets_pushdown1.q.out
b/ql/src/test/results/clientpositive/llap/groupby_grouping_sets_pushdown1.q.out
index 65750dbe233..6bc7ddd3912 100644
---
a/ql/src/test/results/clientpositive/llap/groupby_grouping_sets_pushdown1.q.out
+++
b/ql/src/test/results/clientpositive/llap/groupby_grouping_sets_pushdown1.q.out
@@ -1169,7 +1169,7 @@ STAGE PLANS:
minReductionHashAggr: 0.4
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 1 Data size: 287 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 190 Basic stats:
COMPLETE Column stats: COMPLETE
Reduce Output Operator
bucketingVersion: 2
key expressions: _col0 (type: string), _col1 (type:
string), _col2 (type: bigint)
@@ -1177,7 +1177,7 @@ STAGE PLANS:
numBuckets: -1
sort order: +++
Map-reduce partition columns: _col0 (type: string),
_col1 (type: string), _col2 (type: bigint)
- Statistics: Num rows: 1 Data size: 287 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 190 Basic stats:
COMPLETE Column stats: COMPLETE
tag: -1
value expressions: _col3 (type: bigint)
auto parallelism: true
@@ -1229,7 +1229,7 @@ STAGE PLANS:
keys: KEY._col0 (type: string), KEY._col1 (type: string),
KEY._col2 (type: bigint)
mode: mergepartial
outputColumnNames: _col0, _col1, _col3
- Statistics: Num rows: 1 Data size: 287 Basic stats: COMPLETE
Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 190 Basic stats: COMPLETE
Column stats: COMPLETE
pruneGroupingSetId: true
Select Operator
expressions: _col1 (type: string), _col3 (type: bigint)
diff --git
a/ql/src/test/results/clientpositive/llap/reduce_deduplicate_extended.q.out
b/ql/src/test/results/clientpositive/llap/reduce_deduplicate_extended.q.out
index 4450613b3e8..2e5eabfd9fe 100644
--- a/ql/src/test/results/clientpositive/llap/reduce_deduplicate_extended.q.out
+++ b/ql/src/test/results/clientpositive/llap/reduce_deduplicate_extended.q.out
@@ -112,14 +112,14 @@ STAGE PLANS:
keys: _col0 (type: string), _col1 (type: string)
mode: complete
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 316 Data size: 88164 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 316 Data size: 58776 Basic stats:
COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col2 (type: double),
_col1 (type: string)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 316 Data size: 88164 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 316 Data size: 58776 Basic stats:
COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 316 Data size: 88164 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 316 Data size: 58776 Basic stats:
COMPLETE Column stats: COMPLETE
table:
input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -2725,14 +2725,14 @@ STAGE PLANS:
keys: _col0 (type: string), _col1 (type: string)
mode: complete
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 139500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 500 Data size: 93000 Basic stats:
COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col2 (type: double),
_col1 (type: string)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 139500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 500 Data size: 93000 Basic stats:
COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 500 Data size: 139500 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 500 Data size: 93000 Basic stats:
COMPLETE Column stats: COMPLETE
table:
input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/llap/stats_uppper_lower.q.out
b/ql/src/test/results/clientpositive/llap/stats_uppper_lower.q.out
new file mode 100644
index 00000000000..ee112bc736c
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/stats_uppper_lower.q.out
@@ -0,0 +1,108 @@
+PREHOOK: query: create table t (key string, value string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t
+POSTHOOK: query: create table t (key string, value string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t
+PREHOOK: query: insert into t values ('a', 'a'), ('b', 'b'), ('c', 'c'), ('D',
'D'), ('E', 'E')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t
+POSTHOOK: query: insert into t values ('a', 'a'), ('b', 'b'), ('c', 'c'),
('D', 'D'), ('E', 'E')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t
+POSTHOOK: Lineage: t.key SCRIPT []
+POSTHOOK: Lineage: t.value SCRIPT []
+PREHOOK: query: analyze table t compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@t
+PREHOOK: Output: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table t compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@t
+POSTHOOK: Output: default@t
+#### A masked pattern was here ####
+PREHOOK: query: explain select a.key, lower(a.value) from t a join t b on
a.key = upper(b.key)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain select a.key, lower(a.value) from t a join t b on
a.key = upper(b.key)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Map 2 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ filterExpr: key is not null (type: boolean)
+ Statistics: Num rows: 5 Data size: 850 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 5 Data size: 850 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: key (type: string), lower(value) (type:
string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 5 Data size: 850 Basic stats:
COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 5 Data size: 850 Basic stats:
COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 5 Data size: 850 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: b
+ filterExpr: upper(key) is not null (type: boolean)
+ Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: upper(key) is not null (type: boolean)
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: upper(key) (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
diff --git a/ql/src/test/results/clientpositive/llap/vector_udf1.q.out
b/ql/src/test/results/clientpositive/llap/vector_udf1.q.out
index 4d5bee17647..4677df19f9e 100644
--- a/ql/src/test/results/clientpositive/llap/vector_udf1.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_udf1.q.out
@@ -190,13 +190,13 @@ STAGE PLANS:
native: true
projectedOutputColumnNums: [10, 11, 14]
selectExpressions: StringUpper(col 1:string) ->
10:string, StringUpper(col 3:varchar(20)) -> 11:varchar(20),
StringGroupColEqualStringGroupColumn(col 12:string, col 13:string)(children:
StringUpper(col 1:string) -> 12:string, StringUpper(col 3:varchar(20)) ->
13:varchar(20)) -> 14:boolean
- Statistics: Num rows: 1 Data size: 292 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 186 Basic stats:
COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
File Sink Vectorization:
className: VectorFileSinkOperator
native: false
- Statistics: Num rows: 1 Data size: 292 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 186 Basic stats:
COMPLETE Column stats: COMPLETE
table:
input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -295,13 +295,13 @@ STAGE PLANS:
native: true
projectedOutputColumnNums: [10, 11, 14]
selectExpressions: StringLower(col 1:string) ->
10:string, StringLower(col 3:varchar(20)) -> 11:varchar(20),
StringGroupColEqualStringGroupColumn(col 12:string, col 13:string)(children:
StringLower(col 1:string) -> 12:string, StringLower(col 3:varchar(20)) ->
13:varchar(20)) -> 14:boolean
- Statistics: Num rows: 1 Data size: 292 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 186 Basic stats:
COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
File Sink Vectorization:
className: VectorFileSinkOperator
native: false
- Statistics: Num rows: 1 Data size: 292 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 186 Basic stats:
COMPLETE Column stats: COMPLETE
table:
input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git
a/ql/src/test/results/clientpositive/llap/vectorized_string_funcs.q.out
b/ql/src/test/results/clientpositive/llap/vectorized_string_funcs.q.out
index 12e50372e47..bb951147642 100644
--- a/ql/src/test/results/clientpositive/llap/vectorized_string_funcs.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorized_string_funcs.q.out
@@ -71,10 +71,10 @@ STAGE PLANS:
Select Operator
expressions: substr(cstring1, 1, 2) (type: string),
substr(cstring1, 2) (type: string), lower(cstring1) (type: string),
upper(cstring1) (type: string), upper(cstring1) (type: string),
length(cstring1) (type: int), trim(cstring1) (type: string), ltrim(cstring1)
(type: string), rtrim(cstring1) (type: string), concat(cstring1, cstring2)
(type: string), concat('>', cstring1) (type: string), concat(cstring1, '<')
(type: string), concat(substr(cstring1, 1, 2), substr(cstr [...]
outputColumnNames: _col0, _col1, _col2, _col3, _col4,
_col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
- Statistics: Num rows: 1024 Data size: 2024426 Basic
stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1024 Data size: 1674908 Basic
stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1024 Data size: 2024426 Basic
stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1024 Data size: 1674908 Basic
stats: COMPLETE Column stats: COMPLETE
table:
input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/nonmr_fetch.q.out
b/ql/src/test/results/clientpositive/nonmr_fetch.q.out
index 907025078c1..0a59ffedb2b 100644
--- a/ql/src/test/results/clientpositive/nonmr_fetch.q.out
+++ b/ql/src/test/results/clientpositive/nonmr_fetch.q.out
@@ -472,7 +472,7 @@ STAGE PLANS:
Select Operator
expressions: (UDFToInteger(key) * 10) (type: int), upper(value)
(type: string)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 10 Data size: 1880 Basic stats: COMPLETE
Column stats: COMPLETE
+ Statistics: Num rows: 10 Data size: 950 Basic stats: COMPLETE
Column stats: COMPLETE
ListSink
PREHOOK: query: select cast(key as int) * 10, upper(value) from src limit 10
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query24.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query24.q.out
index dd0ab3975db..85615c0fa49 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query24.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query24.q.out
@@ -95,10 +95,10 @@ STAGE PLANS:
outputColumnNames: _col1, _col3, _col4, _col5, _col6,
_col8, _col10, _col11, _col12
input vertices:
0 Map 9
- Statistics: Num rows: 7981221 Data size: 5203756092
Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 7981221 Data size: 5147887545
Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (_col12 <> _col3) (type: boolean)
- Statistics: Num rows: 7981221 Data size: 5203756092
Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 7981221 Data size: 5147887545
Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col8 (type: bigint), _col10 (type:
char(20)), _col11 (type: char(30)), _col1 (type: char(2)), _col4 (type:
bigint), _col5 (type: varchar(50)), _col6 (type: char(2))
outputColumnNames: _col0, _col2, _col3, _col6,
_col9, _col10, _col11
@@ -232,7 +232,7 @@ STAGE PLANS:
Select Operator
expressions: ca_address_sk (type: bigint), ca_state
(type: char(2)), ca_zip (type: char(10)), upper(ca_country) (type: varchar(20))
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 40000000 Data size: 11480000000
Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 40000000 Data size: 11200000000
Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
Inner Join 0 to 1
@@ -242,13 +242,13 @@ STAGE PLANS:
outputColumnNames: _col0, _col1, _col3, _col4, _col5,
_col6
input vertices:
1 Map 10
- Statistics: Num rows: 712937 Data size: 270916060
Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 712937 Data size: 265925501
Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: bigint)
null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: bigint)
- Statistics: Num rows: 712937 Data size: 270916060
Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 712937 Data size: 265925501
Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: char(2)), _col3
(type: varchar(20)), _col4 (type: bigint), _col5 (type: varchar(50)), _col6
(type: char(2))
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)