[ 
https://issues.apache.org/jira/browse/HIVE-27158?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17714093#comment-17714093
 ] 

Simhadri Govindappa edited comment on HIVE-27158 at 4/19/23 6:42 PM:
---------------------------------------------------------------------

 

After patching a cluster with this change. Here are the steps and the results:

 

Here i checked for Number of distinct values to verify that column stats are 
being used. 

For a table with 31 million rows with duplicate entries.  Please search for 
"ndv" in both the plans below.  

 

With _*hive.iceberg.stats.source=iceberg;*_ 
 # The correct values for ndv is used. 
 # Columns stats snippet below ( from explain formatted) 

{noformat}
"colStats": [
        {
          "name": "dept",
          "ndv": 3
        },
        {
          "name": "id",
          "ndv": 6,
          "minValue": 1,
          "maxValue": 6
        },
        {
          "name": "name",
          "ndv": 2
        }
      ]{noformat}
Full query with explain formatted for iceberg with stats from puffin :
{noformat}
0: jdbc:hive2://simhadrigovindappa-1.simhadri> set 
hive.iceberg.stats.source=iceberg;
No rows affected (0.005 seconds)


0: jdbc:hive2://simhadrigovindappa-1.simhadri> explain formatted select 
distinct  t.dept from (select * from tbl_ice union all select * from tbl_ice) t;


INFO  : Compiling 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:Explain, 
type:string, comment:null)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657); Time 
taken: 0.141 seconds
INFO  : Executing 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657); Time 
taken: 0.015 seconds
INFO  : OK
+----------------------------------------------------+
|                      Explain                       |
+----------------------------------------------------+
| {"CBOPlan":"{\n  \"rels\": [\n    {\n      \"id\": \"0\",\n      \"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan\",\n   
   \"table\": [\n        \"default\",\n        \"tbl_ice\"\n      ],\n      
\"table:alias\": \"tbl_ice\",\n      \"inputs\": [],\n      \"rowCount\": 
3.1850496E7,\n      \"avgRowSize\": 6.666666666666667,\n      \"rowType\": [\n  
      {\n          \"type\": \"INTEGER\",\n          \"nullable\": true,\n      
    \"name\": \"id\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n 
         \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"name\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n   
       \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"dept\"\n        },\n        {\n          \"type\": \"BIGINT\",\n    
      \"nullable\": true,\n          \"name\": 
\"BLOCK__OFFSET__INSIDE__FILE\"\n        },\n        {\n          \"type\": 
\"VARCHAR\",\n          \"nullable\": true,\n          \"precision\": 
2147483647,\n          \"name\": \"INPUT__FILE__NAME\"\n        },\n        {\n 
         \"fields\": [\n            {\n              \"type\": \"BIGINT\",\n    
          \"nullable\": true,\n              \"name\": \"writeid\"\n            
},\n            {\n              \"type\": \"INTEGER\",\n              
\"nullable\": true,\n              \"name\": \"bucketid\"\n            },\n     
       {\n              \"type\": \"BIGINT\",\n              \"nullable\": 
true,\n              \"name\": \"rowid\"\n            }\n          ],\n         
 \"name\": \"ROW__ID\"\n        },\n        {\n          \"type\": 
\"BOOLEAN\",\n          \"nullable\": true,\n          \"name\": 
\"ROW__IS__DELETED\"\n        }\n      ],\n      \"colStats\": [\n        {\n   
       \"name\": \"dept\",\n          \"ndv\": 3\n        },\n        {\n       
   \"name\": \"id\",\n          \"ndv\": 6,\n          \"minValue\": 1,\n       
   \"maxValue\": 6\n        },\n        {\n          \"name\": \"name\",\n      
    \"ndv\": 2\n        }\n      ]\n    },\n    {\n      \"id\": \"1\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 2,\n          \"name\": \"$2\"\n        }\n      ],\n      
\"rowCount\": 3.1850496E7\n    },\n    {\n      \"id\": \"2\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion\",\n      
\"all\": true,\n      \"inputs\": [\n        \"1\",\n        \"1\"\n      ],\n  
    \"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"3\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 0,\n          \"name\": \"$0\"\n        }\n      ],\n      
\"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"4\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate\",\n   
   \"group\": [\n        0\n      ],\n      \"aggs\": [],\n      \"rowCount\": 
6.0\n    }\n  ]\n}","optimizedSQL":"SELECT `dept`\nFROM (SELECT `dept`\nFROM 
`default`.`tbl_ice`\nUNION ALL\nSELECT `dept`\nFROM `default`.`tbl_ice`)\nGROUP 
BY `dept`","cboInfo":"Plan optimized by CBO.","STAGE 
DEPENDENCIES":{"Stage-1":{"ROOT STAGE":"TRUE"},"Stage-0":{"DEPENDENT 
STAGES":"Stage-1"}},"STAGE 
PLANS":{"Stage-1":{"Tez":{"DagId:":"hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657:444","Edges:":{"Map
 1":{"parent":"Union 2","type":"CONTAINS"},"Map 4":{"parent":"Union 
2","type":"CONTAINS"},"Reducer 3":{"parent":"Union 
2","type":"SIMPLE_EDGE"}},"DagName:":"hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657:444","Vertices:":{"Map
 1":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","table:":"tbl_ice","OperatorId:":"TS_12","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"SEL_22","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 261 Data size: 23751 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"GBY_23","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 261 Data size: 23751 
Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"RS_24","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution mode:":"vectorized"},"Map 
4":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","table:":"tbl_ice","OperatorId:":"TS_17","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"SEL_27","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 261 Data size: 23751 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"GBY_28","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 261 Data size: 23751 
Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"RS_29","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution 
mode:":"vectorized"},"Reducer 3":{"Execution mode:":"vectorized","Reduce 
Operator Tree:":{"Group By 
Operator":{"columnExprMap:":{"_col0":"KEY._col0"},"keys:":"KEY._col0 (type: 
string)","mode:":"mergepartial","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"GBY_25","children":{"File Output 
Operator":{"compressed:":"false","Statistics:":"Num rows: 3 Data size: 273 
Basic stats: COMPLETE Column stats: COMPLETE","table:":{"input 
format:":"org.apache.hadoop.mapred.SequenceFileInputFormat","output 
format:":"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"},"OperatorId:":"FS_26"}}}}},"Union
 2":{"Vertex:":"Union 2"}}}},"Stage-0":{"Fetch 
Operator":{"limit:":"-1","Processor 
Tree:":{"ListSink":{"OperatorId:":"LIST_SINK_30"}}}}}} |
+----------------------------------------------------+
1 row selected (0.177 seconds)




0: jdbc:hive2://simhadrigovindappa-1.simhadri> explain select distinct  t.dept 
from (select * from tbl_ice union all select * from tbl_ice) t;


INFO  : Compiling 
command(queryId=hive_20230419122058_20601449-a801-41dc-86de-c6a126adc8df): 
explain select distinct  t.dept from (select * from tbl_ice union all select * 
from tbl_ice) t
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:Explain, 
type:string, comment:null)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419122058_20601449-a801-41dc-86de-c6a126adc8df); Time 
taken: 0.078 seconds
INFO  : Executing 
command(queryId=hive_20230419122058_20601449-a801-41dc-86de-c6a126adc8df): 
explain select distinct  t.dept from (select * from tbl_ice union all select * 
from tbl_ice) t
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419122058_20601449-a801-41dc-86de-c6a126adc8df); Time 
taken: 0.009 seconds
INFO  : OK
+----------------------------------------------------+
|                      Explain                       |
+----------------------------------------------------+
| Plan optimized by CBO.                             |
|                                                    |
| Vertex dependency in root stage                    |
| Map 1 <- Union 2 (CONTAINS)                        |
| Map 4 <- Union 2 (CONTAINS)                        |
| Reducer 3 <- Union 2 (SIMPLE_EDGE)                 |
|                                                    |
| Stage-0                                            |
|   Fetch Operator                                   |
|     limit:-1                                       |
|     Stage-1                                        |
|       Reducer 3 vectorized                         |
|       File Output Operator [FS_26]                 |
|         Group By Operator [GBY_25] (rows=3 width=91) |
|           Output:["_col0"],keys:KEY._col0          |
|         <-Union 2 [SIMPLE_EDGE]                    |
|           <-Map 1 [CONTAINS] vectorized            |
|             Reduce Output Operator [RS_24]         |
|               PartitionCols:_col0                  |
|               Group By Operator [GBY_23] (rows=261 width=91) |
|                 Output:["_col0"],keys:_col0        |
|                 Select Operator [SEL_22] (rows=31850496 width=91) |
|                   Output:["_col0"]                 |
|                   TableScan [TS_12] (rows=31850496 width=91) |
|                     Output:["dept"]                |
|           <-Map 4 [CONTAINS] vectorized            |
|             Reduce Output Operator [RS_29]         |
|               PartitionCols:_col0                  |
|               Group By Operator [GBY_28] (rows=261 width=91) |
|                 Output:["_col0"],keys:_col0        |
|                 Select Operator [SEL_27] (rows=31850496 width=91) |
|                   Output:["_col0"]                 |
|                   TableScan [TS_17] (rows=31850496 width=91) |
|                     Output:["dept"]                |
|                                                    |
+----------------------------------------------------+
35 rows selected (0.104 seconds) {noformat}
 

 

With _*set hive.iceberg.stats.source=metastore;*_ 
 # _**_ This is the older behaviour prior to this patch
 # The values for ndv are incorrectly estimated and column stats are not used.

 

 
{noformat}
0: jdbc:hive2://simhadrigovindappa-1.simhadri> set 
hive.iceberg.stats.source=metastore;
No rows affected (0.006 seconds)


0: jdbc:hive2://simhadrigovindappa-1.simhadri> explain formatted select 
distinct  t.dept from (select * from tbl_ice union all select * from tbl_ice) t;
INFO  : Compiling 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : No Stats for default@tbl_ice, Columns: dept
INFO  : No Stats for default@tbl_ice, Columns: name, id
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:Explain, 
type:string, comment:null)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b); Time 
taken: 0.102 seconds
INFO  : Executing 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b); Time 
taken: 0.017 seconds
INFO  : OK
+----------------------------------------------------+
|                      Explain                       |
+----------------------------------------------------+
| {"CBOPlan":"{\n  \"rels\": [\n    {\n      \"id\": \"0\",\n      \"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan\",\n   
   \"table\": [\n        \"default\",\n        \"tbl_ice\"\n      ],\n      
\"table:alias\": \"tbl_ice\",\n      \"inputs\": [],\n      \"rowCount\": 
3.1850496E7,\n      \"avgRowSize\": 100.0,\n      \"rowType\": [\n        {\n   
       \"type\": \"INTEGER\",\n          \"nullable\": true,\n          
\"name\": \"id\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n     
     \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"name\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n   
       \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"dept\"\n        },\n        {\n          \"type\": \"BIGINT\",\n    
      \"nullable\": true,\n          \"name\": 
\"BLOCK__OFFSET__INSIDE__FILE\"\n        },\n        {\n          \"type\": 
\"VARCHAR\",\n          \"nullable\": true,\n          \"precision\": 
2147483647,\n          \"name\": \"INPUT__FILE__NAME\"\n        },\n        {\n 
         \"fields\": [\n            {\n              \"type\": \"BIGINT\",\n    
          \"nullable\": true,\n              \"name\": \"writeid\"\n            
},\n            {\n              \"type\": \"INTEGER\",\n              
\"nullable\": true,\n              \"name\": \"bucketid\"\n            },\n     
       {\n              \"type\": \"BIGINT\",\n              \"nullable\": 
true,\n              \"name\": \"rowid\"\n            }\n          ],\n         
 \"name\": \"ROW__ID\"\n        },\n        {\n          \"type\": 
\"BOOLEAN\",\n          \"nullable\": true,\n          \"name\": 
\"ROW__IS__DELETED\"\n        }\n      ],\n      \"colStats\": [\n        {\n   
       \"name\": \"dept\",\n          \"ndv\": 6370099\n        },\n        {\n 
         \"name\": \"id\",\n          \"ndv\": 6370099,\n          
\"minValue\": -9223372036854775808,\n          \"maxValue\": 
9223372036854775807\n        },\n        {\n          \"name\": \"name\",\n     
     \"ndv\": 6370099\n        }\n      ]\n    },\n    {\n      \"id\": 
\"1\",\n      \"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 2,\n          \"name\": \"$2\"\n        }\n      ],\n      
\"rowCount\": 3.1850496E7\n    },\n    {\n      \"id\": \"2\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion\",\n      
\"all\": true,\n      \"inputs\": [\n        \"1\",\n        \"1\"\n      ],\n  
    \"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"3\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 0,\n          \"name\": \"$0\"\n        }\n      ],\n      
\"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"4\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate\",\n   
   \"group\": [\n        0\n      ],\n      \"aggs\": [],\n      \"rowCount\": 
1.2740198E7\n    }\n  ]\n}","optimizedSQL":"SELECT `dept`\nFROM (SELECT 
`dept`\nFROM `default`.`tbl_ice`\nUNION ALL\nSELECT `dept`\nFROM 
`default`.`tbl_ice`)\nGROUP BY `dept`","cboInfo":"Plan optimized by 
CBO.","STAGE DEPENDENCIES":{"Stage-1":{"ROOT 
STAGE":"TRUE"},"Stage-0":{"DEPENDENT STAGES":"Stage-1"}},"STAGE 
PLANS":{"Stage-1":{"Tez":{"DagId:":"hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b:445","Edges:":{"Map
 1":{"parent":"Union 2","type":"CONTAINS"},"Map 4":{"parent":"Union 
2","type":"CONTAINS"},"Reducer 3":{"parent":"Union 
2","type":"SIMPLE_EDGE"}},"DagName:":"hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b:445","Vertices:":{"Map
 1":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","table:":"tbl_ice","OperatorId:":"TS_12","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"SEL_22","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 63700992 Data size: 11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"GBY_23","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 63700992 Data size: 
11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"RS_24","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution mode:":"vectorized"},"Map 
4":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","table:":"tbl_ice","OperatorId:":"TS_17","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"SEL_27","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 63700992 Data size: 11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"GBY_28","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 63700992 Data size: 
11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"RS_29","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution 
mode:":"vectorized"},"Reducer 3":{"Execution mode:":"vectorized","Reduce 
Operator Tree:":{"Group By 
Operator":{"columnExprMap:":{"_col0":"KEY._col0"},"keys:":"KEY._col0 (type: 
string)","mode:":"mergepartial","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"GBY_25","children":{"File Output 
Operator":{"compressed:":"false","Statistics:":"Num rows: 31850496 Data size: 
5567467032 Basic stats: COMPLETE Column stats: NONE","table:":{"input 
format:":"org.apache.hadoop.mapred.SequenceFileInputFormat","output 
format:":"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"},"OperatorId:":"FS_26"}}}}},"Union
 2":{"Vertex:":"Union 2"}}}},"Stage-0":{"Fetch 
Operator":{"limit:":"-1","Processor 
Tree:":{"ListSink":{"OperatorId:":"LIST_SINK_30"}}}}}} |
+----------------------------------------------------+
1 row selected (0.141 seconds)
0: jdb



0: jdbc:hive2://simhadrigovindappa-1.simhadri>
0: jdbc:hive2://simhadrigovindappa-1.simhadri> explain select distinct  t.dept 
from (select * from tbl_ice union all select * from tbl_ice) t;
INFO  : Compiling 
command(queryId=hive_20230419122039_d9a7bbe1-06f9-4ec5-b985-31ae1084b0e1): 
explain select distinct  t.dept from (select * from tbl_ice union all select * 
from tbl_ice) t
INFO  : No Stats for default@tbl_ice, Columns: dept
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:Explain, 
type:string, comment:null)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419122039_d9a7bbe1-06f9-4ec5-b985-31ae1084b0e1); Time 
taken: 0.081 seconds
INFO  : Executing 
command(queryId=hive_20230419122039_d9a7bbe1-06f9-4ec5-b985-31ae1084b0e1): 
explain select distinct  t.dept from (select * from tbl_ice union all select * 
from tbl_ice) t
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419122039_d9a7bbe1-06f9-4ec5-b985-31ae1084b0e1); Time 
taken: 0.011 seconds
INFO  : OK
+----------------------------------------------------+
|                      Explain                       |
+----------------------------------------------------+
| Plan optimized by CBO.                             |
|                                                    |
| Vertex dependency in root stage                    |
| Map 1 <- Union 2 (CONTAINS)                        |
| Map 4 <- Union 2 (CONTAINS)                        |
| Reducer 3 <- Union 2 (SIMPLE_EDGE)                 |
|                                                    |
| Stage-0                                            |
|   Fetch Operator                                   |
|     limit:-1                                       |
|     Stage-1                                        |
|       Reducer 3 vectorized                         |
|       File Output Operator [FS_26]                 |
|         Group By Operator [GBY_25] (rows=31850496 width=174) |
|           Output:["_col0"],keys:KEY._col0          |
|         <-Union 2 [SIMPLE_EDGE]                    |
|           <-Map 1 [CONTAINS] vectorized            |
|             Reduce Output Operator [RS_24]         |
|               PartitionCols:_col0                  |
|               Group By Operator [GBY_23] (rows=63700992 width=174) |
|                 Output:["_col0"],keys:_col0        |
|                 Select Operator [SEL_22] (rows=31850496 width=174) |
|                   Output:["_col0"]                 |
|                   TableScan [TS_12] (rows=31850496 width=174) |
|                     Output:["dept"]                |
|           <-Map 4 [CONTAINS] vectorized            |
|             Reduce Output Operator [RS_29]         |
|               PartitionCols:_col0                  |
|               Group By Operator [GBY_28] (rows=63700992 width=174) |
|                 Output:["_col0"],keys:_col0        |
|                 Select Operator [SEL_27] (rows=31850496 width=174) |
|                   Output:["_col0"]                 |
|                   TableScan [TS_17] (rows=31850496 width=174) |
|                     Output:["dept"]                |
|                                                    |
+----------------------------------------------------+
35 rows selected (0.109 seconds){noformat}
 

 

Table details:
{noformat}
0: jdbc:hive2://simhadrigovindappa-1.simhadri> describe formatted tbl_ice;
INFO  : Compiling 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d): 
describe formatted tbl_ice
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:col_name, 
type:string, comment:from deserializer), FieldSchema(name:data_type, 
type:string, comment:from deserializer), FieldSchema(name:comment, type:string, 
comment:from deserializer)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d); Time 
taken: 0.116 seconds
INFO  : Executing 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d): 
describe formatted tbl_ice
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d); Time 
taken: 0.027 seconds
INFO  : OK
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
|           col_name            |                     data_type                 
     |                      comment                       |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
| id                            | int                                           
     |                                                    |
| name                          | string                                        
     |                                                    |
| dept                          | string                                        
     |                                                    |
|                               | NULL                                          
     | NULL                                               |
| # Detailed Table Information  | NULL                                          
     | NULL                                               |
| Database:                     | default                                       
     | NULL                                               |
| OwnerType:                    | USER                                          
     | NULL                                               |
| Owner:                        | hive                                          
     | NULL                                               |
| CreateTime:                   | Thu Apr 13 15:37:52 UTC 2023                  
     | NULL                                               |
| LastAccessTime:               | UNKNOWN                                       
     | NULL                                               |
| Retention:                    | 0                                             
     | NULL                                               |
| Location:                     | 
hdfs://simhadrigovindappa-1.simhadrigovindappa.root.hwx.site:8020/warehouse/tablespace/external/hive/TBL_ICE
 | NULL                                               |
| Table Type:                   | EXTERNAL_TABLE                                
     | NULL                                               |
| Table Parameters:             | NULL                                          
     | NULL                                               |
|                               | EXTERNAL                                      
     | TRUE                                               |
|                               | FORMAT-VERSION                                
     | 2                                                  |
|                               | bucketing_version                             
     | 2                                                  |
|                               | engine.hive.enabled                           
     | true                                               |
|                               | iceberg.orc.files.only                        
     | false                                              |
|                               | metadata_location                             
     | 
hdfs://simhadrigovindappa-1.simhadrigovindappa.root.hwx.site:8020/warehouse/tablespace/external/hive/TBL_ICE/metadata/00026-1e59d88f-b012-4223-93fd-11264b6285fd.metadata.json
 |
|                               | numFiles                                      
     | 26                                                 |
|                               | numRows                                       
     | 31850496                                           |
|                               | previous_metadata_location                    
     | 
hdfs://simhadrigovindappa-1.simhadrigovindappa.root.hwx.site:8020/warehouse/tablespace/external/hive/TBL_ICE/metadata/00025-8bab5a3e-e014-4dd2-b9ad-8a7793546b41.metadata.json
 |
|                               | rawDataSize                                   
     | 0                                                  |
|                               | serialization.format                          
     | 1                                                  |
|                               | storage_handler                               
     | org.apache.iceberg.mr.hive.HiveIcebergStorageHandler |
|                               | table_type                                    
     | ICEBERG                                            |
|                               | totalSize                                     
     | 1519780                                            |
|                               | transient_lastDdlTime                         
     | 1681418834                                         |
|                               | uuid                                          
     | fdd6170c-33aa-481c-83df-b1ed465b7567               |
|                               | write.format.default                          
     | parquet                                            |
|                               | NULL                                          
     | NULL                                               |
| # Storage Information         | NULL                                          
     | NULL                                               |
| SerDe Library:                | org.apache.iceberg.mr.hive.HiveIcebergSerDe   
     | NULL                                               |
| InputFormat:                  | 
org.apache.iceberg.mr.hive.HiveIcebergInputFormat  | NULL                       
                        |
| OutputFormat:                 | 
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat | NULL                       
                        |
| Compressed:                   | No                                            
     | NULL                                               |
| Sort Columns:                 | []                                            
     | NULL                                               |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
38 rows selected (0.167 seconds)
{noformat}
 


was (Author: simhadri-g):
 

After patching a cluster with this change. Here are the steps and the results:

 

Here i checked for Number of distinct values to verify that column stats are 
being used. 

For a table with 31 million rows with duplicate entries.  Please search for 
"ndv" in both the plans below.  

 

With _*hive.iceberg.stats.source=iceberg;*_ 
 # The correct values for ndv is used. 
 # Columns stats snippet below ( from explain formatted) 

{noformat}
"colStats": [
        {
          "name": "dept",
          "ndv": 3
        },
        {
          "name": "id",
          "ndv": 6,
          "minValue": 1,
          "maxValue": 6
        },
        {
          "name": "name",
          "ndv": 2
        }
      ]{noformat}
Full query with explain formatted for iceberg with stats from puffin :
{noformat}
0: jdbc:hive2://simhadrigovindappa-1.simhadri> set 
hive.iceberg.stats.source=iceberg;
No rows affected (0.005 seconds)


0: jdbc:hive2://simhadrigovindappa-1.simhadri> explain formatted select 
distinct  t.dept from (select * from tbl_ice union all select * from tbl_ice) t;
INFO  : Compiling 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:Explain, 
type:string, comment:null)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657); Time 
taken: 0.141 seconds
INFO  : Executing 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657); Time 
taken: 0.015 seconds
INFO  : OK
+----------------------------------------------------+
|                      Explain                       |
+----------------------------------------------------+
| {"CBOPlan":"{\n  \"rels\": [\n    {\n      \"id\": \"0\",\n      \"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan\",\n   
   \"table\": [\n        \"default\",\n        \"tbl_ice\"\n      ],\n      
\"table:alias\": \"tbl_ice\",\n      \"inputs\": [],\n      \"rowCount\": 
3.1850496E7,\n      \"avgRowSize\": 6.666666666666667,\n      \"rowType\": [\n  
      {\n          \"type\": \"INTEGER\",\n          \"nullable\": true,\n      
    \"name\": \"id\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n 
         \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"name\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n   
       \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"dept\"\n        },\n        {\n          \"type\": \"BIGINT\",\n    
      \"nullable\": true,\n          \"name\": 
\"BLOCK__OFFSET__INSIDE__FILE\"\n        },\n        {\n          \"type\": 
\"VARCHAR\",\n          \"nullable\": true,\n          \"precision\": 
2147483647,\n          \"name\": \"INPUT__FILE__NAME\"\n        },\n        {\n 
         \"fields\": [\n            {\n              \"type\": \"BIGINT\",\n    
          \"nullable\": true,\n              \"name\": \"writeid\"\n            
},\n            {\n              \"type\": \"INTEGER\",\n              
\"nullable\": true,\n              \"name\": \"bucketid\"\n            },\n     
       {\n              \"type\": \"BIGINT\",\n              \"nullable\": 
true,\n              \"name\": \"rowid\"\n            }\n          ],\n         
 \"name\": \"ROW__ID\"\n        },\n        {\n          \"type\": 
\"BOOLEAN\",\n          \"nullable\": true,\n          \"name\": 
\"ROW__IS__DELETED\"\n        }\n      ],\n      \"colStats\": [\n        {\n   
       \"name\": \"dept\",\n          \"ndv\": 3\n        },\n        {\n       
   \"name\": \"id\",\n          \"ndv\": 6,\n          \"minValue\": 1,\n       
   \"maxValue\": 6\n        },\n        {\n          \"name\": \"name\",\n      
    \"ndv\": 2\n        }\n      ]\n    },\n    {\n      \"id\": \"1\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 2,\n          \"name\": \"$2\"\n        }\n      ],\n      
\"rowCount\": 3.1850496E7\n    },\n    {\n      \"id\": \"2\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion\",\n      
\"all\": true,\n      \"inputs\": [\n        \"1\",\n        \"1\"\n      ],\n  
    \"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"3\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 0,\n          \"name\": \"$0\"\n        }\n      ],\n      
\"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"4\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate\",\n   
   \"group\": [\n        0\n      ],\n      \"aggs\": [],\n      \"rowCount\": 
6.0\n    }\n  ]\n}","optimizedSQL":"SELECT `dept`\nFROM (SELECT `dept`\nFROM 
`default`.`tbl_ice`\nUNION ALL\nSELECT `dept`\nFROM `default`.`tbl_ice`)\nGROUP 
BY `dept`","cboInfo":"Plan optimized by CBO.","STAGE 
DEPENDENCIES":{"Stage-1":{"ROOT STAGE":"TRUE"},"Stage-0":{"DEPENDENT 
STAGES":"Stage-1"}},"STAGE 
PLANS":{"Stage-1":{"Tez":{"DagId:":"hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657:444","Edges:":{"Map
 1":{"parent":"Union 2","type":"CONTAINS"},"Map 4":{"parent":"Union 
2","type":"CONTAINS"},"Reducer 3":{"parent":"Union 
2","type":"SIMPLE_EDGE"}},"DagName:":"hive_20230419125130_dd45a223-aef7-4f91-afb9-7b016f0db657:444","Vertices:":{"Map
 1":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","table:":"tbl_ice","OperatorId:":"TS_12","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"SEL_22","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 261 Data size: 23751 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"GBY_23","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 261 Data size: 23751 
Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"RS_24","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution mode:":"vectorized"},"Map 
4":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","table:":"tbl_ice","OperatorId:":"TS_17","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 2898395136 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"SEL_27","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 261 Data size: 23751 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"GBY_28","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 261 Data size: 23751 
Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"RS_29","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution 
mode:":"vectorized"},"Reducer 3":{"Execution mode:":"vectorized","Reduce 
Operator Tree:":{"Group By 
Operator":{"columnExprMap:":{"_col0":"KEY._col0"},"keys:":"KEY._col0 (type: 
string)","mode:":"mergepartial","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 3 Data size: 273 Basic stats: COMPLETE Column stats: 
COMPLETE","OperatorId:":"GBY_25","children":{"File Output 
Operator":{"compressed:":"false","Statistics:":"Num rows: 3 Data size: 273 
Basic stats: COMPLETE Column stats: COMPLETE","table:":{"input 
format:":"org.apache.hadoop.mapred.SequenceFileInputFormat","output 
format:":"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"},"OperatorId:":"FS_26"}}}}},"Union
 2":{"Vertex:":"Union 2"}}}},"Stage-0":{"Fetch 
Operator":{"limit:":"-1","Processor 
Tree:":{"ListSink":{"OperatorId:":"LIST_SINK_30"}}}}}} |
+----------------------------------------------------+
1 row selected (0.177 seconds)

{noformat}
 

 

With _*set hive.iceberg.stats.source=metastore;*_ 
 #  _**_ This is the older behaviour prior to this patch
 # The values for ndv are incorrectly estimated and column stats are not used.

 

 
{noformat}
0: jdbc:hive2://simhadrigovindappa-1.simhadri> set 
hive.iceberg.stats.source=metastore;
No rows affected (0.006 seconds)
0: jdbc:hive2://simhadrigovindappa-1.simhadri> explain formatted select 
distinct  t.dept from (select * from tbl_ice union all select * from tbl_ice) t;
INFO  : Compiling 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : No Stats for default@tbl_ice, Columns: dept
INFO  : No Stats for default@tbl_ice, Columns: name, id
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:Explain, 
type:string, comment:null)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b); Time 
taken: 0.102 seconds
INFO  : Executing 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b): 
explain formatted select distinct  t.dept from (select * from tbl_ice union all 
select * from tbl_ice) t
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b); Time 
taken: 0.017 seconds
INFO  : OK
+----------------------------------------------------+
|                      Explain                       |
+----------------------------------------------------+
| {"CBOPlan":"{\n  \"rels\": [\n    {\n      \"id\": \"0\",\n      \"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan\",\n   
   \"table\": [\n        \"default\",\n        \"tbl_ice\"\n      ],\n      
\"table:alias\": \"tbl_ice\",\n      \"inputs\": [],\n      \"rowCount\": 
3.1850496E7,\n      \"avgRowSize\": 100.0,\n      \"rowType\": [\n        {\n   
       \"type\": \"INTEGER\",\n          \"nullable\": true,\n          
\"name\": \"id\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n     
     \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"name\"\n        },\n        {\n          \"type\": \"VARCHAR\",\n   
       \"nullable\": true,\n          \"precision\": 2147483647,\n          
\"name\": \"dept\"\n        },\n        {\n          \"type\": \"BIGINT\",\n    
      \"nullable\": true,\n          \"name\": 
\"BLOCK__OFFSET__INSIDE__FILE\"\n        },\n        {\n          \"type\": 
\"VARCHAR\",\n          \"nullable\": true,\n          \"precision\": 
2147483647,\n          \"name\": \"INPUT__FILE__NAME\"\n        },\n        {\n 
         \"fields\": [\n            {\n              \"type\": \"BIGINT\",\n    
          \"nullable\": true,\n              \"name\": \"writeid\"\n            
},\n            {\n              \"type\": \"INTEGER\",\n              
\"nullable\": true,\n              \"name\": \"bucketid\"\n            },\n     
       {\n              \"type\": \"BIGINT\",\n              \"nullable\": 
true,\n              \"name\": \"rowid\"\n            }\n          ],\n         
 \"name\": \"ROW__ID\"\n        },\n        {\n          \"type\": 
\"BOOLEAN\",\n          \"nullable\": true,\n          \"name\": 
\"ROW__IS__DELETED\"\n        }\n      ],\n      \"colStats\": [\n        {\n   
       \"name\": \"dept\",\n          \"ndv\": 6370099\n        },\n        {\n 
         \"name\": \"id\",\n          \"ndv\": 6370099,\n          
\"minValue\": -9223372036854775808,\n          \"maxValue\": 
9223372036854775807\n        },\n        {\n          \"name\": \"name\",\n     
     \"ndv\": 6370099\n        }\n      ]\n    },\n    {\n      \"id\": 
\"1\",\n      \"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 2,\n          \"name\": \"$2\"\n        }\n      ],\n      
\"rowCount\": 3.1850496E7\n    },\n    {\n      \"id\": \"2\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion\",\n      
\"all\": true,\n      \"inputs\": [\n        \"1\",\n        \"1\"\n      ],\n  
    \"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"3\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject\",\n     
 \"fields\": [\n        \"dept\"\n      ],\n      \"exprs\": [\n        {\n     
     \"input\": 0,\n          \"name\": \"$0\"\n        }\n      ],\n      
\"rowCount\": 6.3700992E7\n    },\n    {\n      \"id\": \"4\",\n      
\"relOp\": 
\"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate\",\n   
   \"group\": [\n        0\n      ],\n      \"aggs\": [],\n      \"rowCount\": 
1.2740198E7\n    }\n  ]\n}","optimizedSQL":"SELECT `dept`\nFROM (SELECT 
`dept`\nFROM `default`.`tbl_ice`\nUNION ALL\nSELECT `dept`\nFROM 
`default`.`tbl_ice`)\nGROUP BY `dept`","cboInfo":"Plan optimized by 
CBO.","STAGE DEPENDENCIES":{"Stage-1":{"ROOT 
STAGE":"TRUE"},"Stage-0":{"DEPENDENT STAGES":"Stage-1"}},"STAGE 
PLANS":{"Stage-1":{"Tez":{"DagId:":"hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b:445","Edges:":{"Map
 1":{"parent":"Union 2","type":"CONTAINS"},"Map 4":{"parent":"Union 
2","type":"CONTAINS"},"Reducer 3":{"parent":"Union 
2","type":"SIMPLE_EDGE"}},"DagName:":"hive_20230419125158_88fc9874-e420-4015-a472-e39106913b5b:445","Vertices:":{"Map
 1":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","table:":"tbl_ice","OperatorId:":"TS_12","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"SEL_22","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 63700992 Data size: 11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"GBY_23","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 63700992 Data size: 
11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"RS_24","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution mode:":"vectorized"},"Map 
4":{"Map Operator 
Tree:":[{"TableScan":{"alias:":"tbl_ice","columns:":["dept"],"database:":"default","Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","table:":"tbl_ice","OperatorId:":"TS_17","children":{"Select 
Operator":{"expressions:":"dept (type: 
string)","columnExprMap:":{"_col0":"dept"},"outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"SEL_27","children":{"Group By 
Operator":{"columnExprMap:":{"_col0":"_col0"},"keys:":"_col0 (type: 
string)","minReductionHashAggr:":"0.5","mode:":"hash","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 63700992 Data size: 11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"GBY_28","children":{"Reduce Output 
Operator":{"columnExprMap:":{"KEY._col0":"_col0"},"key expressions:":"_col0 
(type: string)","null sort order:":"a","sort order:":"+","Map-reduce partition 
columns:":"_col0 (type: string)","Statistics:":"Num rows: 63700992 Data size: 
11134934064 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"RS_29","outputname:":"Reducer 
3","outputOperator:":["GBY_25"]}}}}}}}}],"Execution 
mode:":"vectorized"},"Reducer 3":{"Execution mode:":"vectorized","Reduce 
Operator Tree:":{"Group By 
Operator":{"columnExprMap:":{"_col0":"KEY._col0"},"keys:":"KEY._col0 (type: 
string)","mode:":"mergepartial","outputColumnNames:":["_col0"],"Statistics:":"Num
 rows: 31850496 Data size: 5567467032 Basic stats: COMPLETE Column stats: 
NONE","OperatorId:":"GBY_25","children":{"File Output 
Operator":{"compressed:":"false","Statistics:":"Num rows: 31850496 Data size: 
5567467032 Basic stats: COMPLETE Column stats: NONE","table:":{"input 
format:":"org.apache.hadoop.mapred.SequenceFileInputFormat","output 
format:":"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"},"OperatorId:":"FS_26"}}}}},"Union
 2":{"Vertex:":"Union 2"}}}},"Stage-0":{"Fetch 
Operator":{"limit:":"-1","Processor 
Tree:":{"ListSink":{"OperatorId:":"LIST_SINK_30"}}}}}} |
+----------------------------------------------------+
1 row selected (0.141 seconds)
0: jdb{noformat}
 

 

Table details:
{noformat}
0: jdbc:hive2://simhadrigovindappa-1.simhadri> describe formatted tbl_ice;
INFO  : Compiling 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d): 
describe formatted tbl_ice
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:col_name, 
type:string, comment:from deserializer), FieldSchema(name:data_type, 
type:string, comment:from deserializer), FieldSchema(name:comment, type:string, 
comment:from deserializer)], properties:null)
INFO  : Completed compiling 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d); Time 
taken: 0.116 seconds
INFO  : Executing 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d): 
describe formatted tbl_ice
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing 
command(queryId=hive_20230419125721_f21b45f1-80e6-4c09-b576-256b5f3a132d); Time 
taken: 0.027 seconds
INFO  : OK
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
|           col_name            |                     data_type                 
     |                      comment                       |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
| id                            | int                                           
     |                                                    |
| name                          | string                                        
     |                                                    |
| dept                          | string                                        
     |                                                    |
|                               | NULL                                          
     | NULL                                               |
| # Detailed Table Information  | NULL                                          
     | NULL                                               |
| Database:                     | default                                       
     | NULL                                               |
| OwnerType:                    | USER                                          
     | NULL                                               |
| Owner:                        | hive                                          
     | NULL                                               |
| CreateTime:                   | Thu Apr 13 15:37:52 UTC 2023                  
     | NULL                                               |
| LastAccessTime:               | UNKNOWN                                       
     | NULL                                               |
| Retention:                    | 0                                             
     | NULL                                               |
| Location:                     | 
hdfs://simhadrigovindappa-1.simhadrigovindappa.root.hwx.site:8020/warehouse/tablespace/external/hive/TBL_ICE
 | NULL                                               |
| Table Type:                   | EXTERNAL_TABLE                                
     | NULL                                               |
| Table Parameters:             | NULL                                          
     | NULL                                               |
|                               | EXTERNAL                                      
     | TRUE                                               |
|                               | FORMAT-VERSION                                
     | 2                                                  |
|                               | bucketing_version                             
     | 2                                                  |
|                               | engine.hive.enabled                           
     | true                                               |
|                               | iceberg.orc.files.only                        
     | false                                              |
|                               | metadata_location                             
     | 
hdfs://simhadrigovindappa-1.simhadrigovindappa.root.hwx.site:8020/warehouse/tablespace/external/hive/TBL_ICE/metadata/00026-1e59d88f-b012-4223-93fd-11264b6285fd.metadata.json
 |
|                               | numFiles                                      
     | 26                                                 |
|                               | numRows                                       
     | 31850496                                           |
|                               | previous_metadata_location                    
     | 
hdfs://simhadrigovindappa-1.simhadrigovindappa.root.hwx.site:8020/warehouse/tablespace/external/hive/TBL_ICE/metadata/00025-8bab5a3e-e014-4dd2-b9ad-8a7793546b41.metadata.json
 |
|                               | rawDataSize                                   
     | 0                                                  |
|                               | serialization.format                          
     | 1                                                  |
|                               | storage_handler                               
     | org.apache.iceberg.mr.hive.HiveIcebergStorageHandler |
|                               | table_type                                    
     | ICEBERG                                            |
|                               | totalSize                                     
     | 1519780                                            |
|                               | transient_lastDdlTime                         
     | 1681418834                                         |
|                               | uuid                                          
     | fdd6170c-33aa-481c-83df-b1ed465b7567               |
|                               | write.format.default                          
     | parquet                                            |
|                               | NULL                                          
     | NULL                                               |
| # Storage Information         | NULL                                          
     | NULL                                               |
| SerDe Library:                | org.apache.iceberg.mr.hive.HiveIcebergSerDe   
     | NULL                                               |
| InputFormat:                  | 
org.apache.iceberg.mr.hive.HiveIcebergInputFormat  | NULL                       
                        |
| OutputFormat:                 | 
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat | NULL                       
                        |
| Compressed:                   | No                                            
     | NULL                                               |
| Sort Columns:                 | []                                            
     | NULL                                               |
+-------------------------------+----------------------------------------------------+----------------------------------------------------+
38 rows selected (0.167 seconds)
{noformat}
 

> Store hive columns stats in puffin files for iceberg tables
> -----------------------------------------------------------
>
>                 Key: HIVE-27158
>                 URL: https://issues.apache.org/jira/browse/HIVE-27158
>             Project: Hive
>          Issue Type: Improvement
>            Reporter: Simhadri Govindappa
>            Assignee: Simhadri Govindappa
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 4.0.0
>
>          Time Spent: 11h 20m
>  Remaining Estimate: 0h
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to