[ 
https://issues.apache.org/jira/browse/HIVE-27163?focusedWorklogId=858928&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-858928
 ]

ASF GitHub Bot logged work on HIVE-27163:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 25/Apr/23 12:54
            Start Date: 25/Apr/23 12:54
    Worklog Time Spent: 10m 
      Work Description: kasakrisz commented on code in PR #4228:
URL: https://github.com/apache/hive/pull/4228#discussion_r1176442212


##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java:
##########
@@ -921,14 +924,23 @@ public Table toTable(HiveConf conf) throws HiveException {
     // When replicating the statistics for a table will be obtained from the 
source. Do not
     // reset it on replica.
     if (replicationSpec == null || !replicationSpec.isInReplicationScope()) {
-      if (!this.isCTAS && (tbl.getPath() == null || (!isExternal() && 
tbl.isEmpty()))) {
-        if (!tbl.isPartitioned() && 
conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
-          
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
-                  MetaStoreUtils.getColumnNames(tbl.getCols()), 
StatsSetupConst.TRUE);
-        }
-      } else {
-        
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), 
null,
-                StatsSetupConst.FALSE);
+      // Remove COLUMN_STATS_ACCURATE=true from table's parameter, let the HMS 
determine if
+      // there is need to add column stats dependent on the table's location 
in case the metastore transformer
+      // sets or alters the table's location.
+      
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), 
null,
+          StatsSetupConst.FALSE);
+      if (!this.isCTAS && !tbl.isPartitioned() && !tbl.isTemporary() &&
+          conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+        // Put the flag into the dictionary in order not to pollute the table,
+        // ObjectDictionary is meant to convey repeatitive messages.
+        ObjectDictionary dictionary = tbl.getTTable().isSetDictionary() ?
+            tbl.getTTable().getDictionary() : new ObjectDictionary();
+        String cols = 
MetaStoreUtils.getColumnNames(tbl.getCols()).stream().collect(Collectors.joining("\0"));

Review Comment:
   Please extract `"\0"` to a constant.



##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java:
##########
@@ -508,6 +511,48 @@ public static void clearQuickStats(Map<String, String> 
params) {
     params.remove(StatsSetupConst.NUM_ERASURE_CODED_FILES);
   }
 
+  public static void updateTableStatsForCreateTable(Warehouse wh, Database db, 
Table tbl,
+      EnvironmentContext envContext, Configuration conf, Path tblPath, boolean 
newDir)
+      throws MetaException {
+    // If the created table is a view, skip generating the stats
+    if (MetaStoreUtils.isView(tbl)) {
+      return;
+    }
+    assert tblPath != null;
+    if (tbl.isSetDictionary() && tbl.getDictionary().getValues() != null) {
+      List<java.nio.ByteBuffer> values = tbl.getDictionary().getValues().
+          remove(StatsSetupConst.STATS_FOR_CREATE_TABLE);
+      java.nio.ByteBuffer buffer;
+      if (values != null && values.size() > 0 && (buffer = 
values.get(0)).hasArray()) {
+        String val = new String(buffer.array(), StandardCharsets.UTF_8);
+        if (StatsSetupConst.TRUE.equals(val)) {
+          try {
+            boolean isIcebergTable =
+                
HiveMetaHook.ICEBERG.equalsIgnoreCase(tbl.getParameters().get(HiveMetaHook.TABLE_TYPE));
+            PathFilter pathFilter = isIcebergTable ?
+                path -> !"metadata".equals(path.getName()) : 
FileUtils.HIDDEN_FILES_PATH_FILTER;

Review Comment:
   This part is too Iceberg specific. Is it possible to get this filter via 
storage handler api?



##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java:
##########
@@ -921,14 +924,23 @@ public Table toTable(HiveConf conf) throws HiveException {
     // When replicating the statistics for a table will be obtained from the 
source. Do not
     // reset it on replica.
     if (replicationSpec == null || !replicationSpec.isInReplicationScope()) {
-      if (!this.isCTAS && (tbl.getPath() == null || (!isExternal() && 
tbl.isEmpty()))) {
-        if (!tbl.isPartitioned() && 
conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
-          
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
-                  MetaStoreUtils.getColumnNames(tbl.getCols()), 
StatsSetupConst.TRUE);
-        }
-      } else {
-        
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), 
null,
-                StatsSetupConst.FALSE);
+      // Remove COLUMN_STATS_ACCURATE=true from table's parameter, let the HMS 
determine if
+      // there is need to add column stats dependent on the table's location 
in case the metastore transformer
+      // sets or alters the table's location.
+      
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), 
null,
+          StatsSetupConst.FALSE);
+      if (!this.isCTAS && !tbl.isPartitioned() && !tbl.isTemporary() &&
+          conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+        // Put the flag into the dictionary in order not to pollute the table,
+        // ObjectDictionary is meant to convey repeatitive messages.
+        ObjectDictionary dictionary = tbl.getTTable().isSetDictionary() ?
+            tbl.getTTable().getDictionary() : new ObjectDictionary();
+        String cols = 
MetaStoreUtils.getColumnNames(tbl.getCols()).stream().collect(Collectors.joining("\0"));
+        List<java.nio.ByteBuffer> buffers = new ArrayList<>();
+        
buffers.add(java.nio.ByteBuffer.wrap(StatsSetupConst.TRUE.getBytes(StandardCharsets.UTF_8)));
+        
buffers.add(java.nio.ByteBuffer.wrap(cols.getBytes(StandardCharsets.UTF_8)));
+        dictionary.putToValues(StatsSetupConst.STATS_FOR_CREATE_TABLE, 
buffers);

Review Comment:
   Could you please use some serialization library here like 
`ObjectOutputStream` or json? Maybe a new class could be introduced to 
represent the data structure and makes easier to extend it in the future.
   ```
   class StatsSetup {
     private boolean enabled;
     private List<String> columnNames;
   }
   ```



##########
standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java:
##########
@@ -500,8 +501,16 @@ public void recycleDirToCmPath(Path f, boolean ifPurge) 
throws MetaException {
   }
 
   public boolean isEmptyDir(Path path) throws IOException, MetaException {
+    return isEmptyDir(path, null);

Review Comment:
   Can you pass `p -> true` instead of `null` ?
   
   Also it seems that `FileSystem.listStatus` has a bigger memory footprint 
when a filter is passed:
   
https://github.com/apache/hadoop/blob/a3b9c37a397ad4188041dd80621bdeefc46885f2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java#L2011





Issue Time Tracking
-------------------

    Worklog Id:     (was: 858928)
    Time Spent: 1h 20m  (was: 1h 10m)

> Column stats are not getting published after an insert query into an external 
> table with custom location
> --------------------------------------------------------------------------------------------------------
>
>                 Key: HIVE-27163
>                 URL: https://issues.apache.org/jira/browse/HIVE-27163
>             Project: Hive
>          Issue Type: Bug
>          Components: Hive
>            Reporter: Taraka Rama Rao Lethavadla
>            Assignee: Zhihua Deng
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 1h 20m
>  Remaining Estimate: 0h
>
> Test case details are below
> *test.q*
> {noformat}
> set hive.stats.column.autogather=true;
> set hive.stats.autogather=true;
> dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/test;
> create external table test_custom(age int, name string) stored as orc 
> location '/tmp/test';
> insert into test_custom select 1, 'test';
> desc formatted test_custom age;{noformat}
> *test.q.out*
>  
>  
> {noformat}
> #### A masked pattern was here ####
> PREHOOK: type: CREATETABLE
> #### A masked pattern was here ####
> PREHOOK: Output: database:default
> PREHOOK: Output: default@test_custom
> #### A masked pattern was here ####
> POSTHOOK: type: CREATETABLE
> #### A masked pattern was here ####
> POSTHOOK: Output: database:default
> POSTHOOK: Output: default@test_custom
> PREHOOK: query: insert into test_custom select 1, 'test'
> PREHOOK: type: QUERY
> PREHOOK: Input: _dummy_database@_dummy_table
> PREHOOK: Output: default@test_custom
> POSTHOOK: query: insert into test_custom select 1, 'test'
> POSTHOOK: type: QUERY
> POSTHOOK: Input: _dummy_database@_dummy_table
> POSTHOOK: Output: default@test_custom
> POSTHOOK: Lineage: test_custom.age SIMPLE []
> POSTHOOK: Lineage: test_custom.name SIMPLE []
> PREHOOK: query: desc formatted test_custom age
> PREHOOK: type: DESCTABLE
> PREHOOK: Input: default@test_custom
> POSTHOOK: query: desc formatted test_custom age
> POSTHOOK: type: DESCTABLE
> POSTHOOK: Input: default@test_custom
> col_name                age
> data_type               int
> min
> max
> num_nulls
> distinct_count
> avg_col_len
> max_col_len
> num_trues
> num_falses
> bit_vector
> comment                 from deserializer{noformat}
> As we can see from desc formatted output, column stats were not populated
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to