Author: hashutosh
Date: Thu Mar 13 22:51:09 2014
New Revision: 1577364

URL: http://svn.apache.org/r1577364
Log:
HIVE-6630 : FS based stats collection has issues for list bucketing case 
(Ashutosh Chauhan via Gunther Hagleitner)

Added:
    hive/trunk/ql/src/test/queries/clientpositive/lb_fs_stats.q
    hive/trunk/ql/src/test/results/clientpositive/lb_fs_stats.q.out
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
    
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/fs/FSStatsPublisher.java

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java?rev=1577364&r1=1577363&r2=1577364&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java 
Thu Mar 13 22:51:09 2014
@@ -914,11 +914,10 @@ public class FileSinkOperator extends Te
       String lbSpec = split[1];
 
       String prefix;
-      String postfix;
+      String postfix=null;
       if (taskIndependent) {
         // key = "database.table/SP/DP/"LB/
         prefix = conf.getTableInfo().getTableName();
-        postfix = Utilities.join(lbSpec);
       } else {
         // key = "prefix/SP/DP/"LB/taskID/
         prefix = conf.getStatsAggPrefix();

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/fs/FSStatsPublisher.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/fs/FSStatsPublisher.java?rev=1577364&r1=1577363&r2=1577364&view=diff
==============================================================================
--- 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/fs/FSStatsPublisher.java 
(original)
+++ 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/fs/FSStatsPublisher.java 
Thu Mar 13 22:51:09 2014
@@ -21,6 +21,7 @@ package org.apache.hadoop.hive.ql.stats.
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Map.Entry;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -71,7 +72,15 @@ public class FSStatsPublisher implements
   public boolean publishStat(String partKV, Map<String, String> stats) {
     LOG.debug("Putting in map : " + partKV + "\t" + stats);
     // we need to do new hashmap, since stats object is reused across calls.
-    statsMap.put(partKV, new HashMap<String, String>(stats));
+    Map<String,String> cpy = new HashMap<String, String>(stats);
+    Map<String,String> statMap = statsMap.get(partKV);
+    if (null != statMap) {
+      // In case of LB, we might get called repeatedly.
+      for (Entry<String, String> e : statMap.entrySet()) {
+        cpy.put(e.getKey(), String.valueOf(Long.valueOf(e.getValue()) + 
Long.valueOf(cpy.get(e.getKey()))));
+      }
+    }
+    statsMap.put(partKV, cpy);
     return true;
   }
 

Added: hive/trunk/ql/src/test/queries/clientpositive/lb_fs_stats.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/lb_fs_stats.q?rev=1577364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/lb_fs_stats.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/lb_fs_stats.q Thu Mar 13 
22:51:09 2014
@@ -0,0 +1,19 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.merge.mapfiles=false; 
+set hive.merge.mapredfiles=false; 
+set mapred.input.dir.recursive=true;
+set hive.stats.dbclass=fs;
+-- Tests truncating a column from a list bucketing table
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+CREATE TABLE test_tab (key STRING, value STRING) PARTITIONED BY (part STRING) 
STORED AS RCFILE;
+
+ALTER TABLE test_tab SKEWED BY (key) ON ("484") STORED AS DIRECTORIES;
+
+INSERT OVERWRITE TABLE test_tab PARTITION (part = '1') SELECT * FROM src;
+
+describe formatted test_tab partition (part='1');
+
+set hive.stats.dbclass=jdbc:derby;

Added: hive/trunk/ql/src/test/results/clientpositive/lb_fs_stats.q.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/lb_fs_stats.q.out?rev=1577364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/lb_fs_stats.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/lb_fs_stats.q.out Thu Mar 13 
22:51:09 2014
@@ -0,0 +1,79 @@
+PREHOOK: query: -- Tests truncating a column from a list bucketing table
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+CREATE TABLE test_tab (key STRING, value STRING) PARTITIONED BY (part STRING) 
STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: -- Tests truncating a column from a list bucketing table
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+CREATE TABLE test_tab (key STRING, value STRING) PARTITIONED BY (part STRING) 
STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_tab
+PREHOOK: query: ALTER TABLE test_tab SKEWED BY (key) ON ("484") STORED AS 
DIRECTORIES
+PREHOOK: type: ALTERTABLE_SKEWED
+PREHOOK: Input: default@test_tab
+PREHOOK: Output: default@test_tab
+POSTHOOK: query: ALTER TABLE test_tab SKEWED BY (key) ON ("484") STORED AS 
DIRECTORIES
+POSTHOOK: type: ALTERTABLE_SKEWED
+POSTHOOK: Input: default@test_tab
+POSTHOOK: Output: default@test_tab
+PREHOOK: query: INSERT OVERWRITE TABLE test_tab PARTITION (part = '1') SELECT 
* FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_tab@part=1
+POSTHOOK: query: INSERT OVERWRITE TABLE test_tab PARTITION (part = '1') SELECT 
* FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_tab@part=1
+POSTHOOK: Lineage: test_tab PARTITION(part=1).key SIMPLE 
[(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_tab PARTITION(part=1).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: describe formatted test_tab partition (part='1')
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: describe formatted test_tab partition (part='1')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: test_tab PARTITION(part=1).key SIMPLE 
[(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_tab PARTITION(part=1).value SIMPLE 
[(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name             data_type               comment             
+                
+key                    string                  None                
+value                  string                  None                
+                
+# Partition Information                 
+# col_name             data_type               comment             
+                
+part                   string                  None                
+                
+# Detailed Partition Information                
+Partition Value:       [1]                      
+Database:              default                  
+Table:                 test_tab                 
+#### A masked pattern was here ####
+Protect Mode:          None                     
+#### A masked pattern was here ####
+Partition Parameters:           
+       COLUMN_STATS_ACCURATE   true                
+       numFiles                2                   
+       numRows                 500                 
+       rawDataSize             4812                
+       totalSize               5370                
+#### A masked pattern was here ####
+                
+# Storage Information           
+SerDe Library:         org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe    
 
+InputFormat:           org.apache.hadoop.hive.ql.io.RCFileInputFormat   
+OutputFormat:          org.apache.hadoop.hive.ql.io.RCFileOutputFormat  
+Compressed:            No                       
+Num Buckets:           -1                       
+Bucket Columns:        []                       
+Sort Columns:          []                       
+Stored As SubDirectories:      Yes                      
+Skewed Columns:        [key]                    
+Skewed Values:         [[484]]                  
+#### A masked pattern was here ####
+Skewed Value to Truncated Path:        {[484]=/test_tab/part=1/key=484}        
 
+Storage Desc Params:            
+       serialization.format    1                   


Reply via email to