Author: navis
Date: Wed Jul 9 06:49:00 2014
New Revision: 1609025

URL: http://svn.apache.org/r1609025
Log:
HIVE-7220 : Empty dir in external table causes issue (Szehon Ho via Navis)
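Background for the change: CombineFileInputFormat can hand back a directory (for example, an empty subdirectory under an external table's location) among a split's paths, and the record reader then fails when it tries to open that path as a file. Below is a minimal sketch of the failure mode, assuming a reachable HDFS; the class name and paths are illustrative and not part of this patch.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Illustrative only: a record reader effectively calls fs.open(splitPath)
    // for each path in a split; opening a directory throws on HDFS.
    public class EmptyDirFailureSketch {
      public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        Path emptyDir = new Path("/tmp/test_empty_table/empty"); // hypothetical path
        fs.mkdirs(emptyDir);
        try {
          fs.open(emptyDir).close(); // a scan over the table would die here before this fix
        } catch (IOException expected) {
          System.out.println("Directory in split fails: " + expected.getMessage());
        }
      }
    }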
Added:
    hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q
    hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out
Modified:
    hive/trunk/itests/qtest/testconfiguration.properties
    hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java

Modified: hive/trunk/itests/qtest/testconfiguration.properties
URL: http://svn.apache.org/viewvc/hive/trunk/itests/qtest/testconfiguration.properties?rev=1609025&r1=1609024&r2=1609025&view=diff
==============================================================================
--- hive/trunk/itests/qtest/testconfiguration.properties (original)
+++ hive/trunk/itests/qtest/testconfiguration.properties Wed Jul 9 06:49:00 2014
@@ -1,4 +1,4 @@
-minimr.query.files=stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q
+minimr.query.files=stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q,empty_dir_in_table.q
 minimr.query.negative.files=cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q,udf_local_resource.q
 minitez.query.files=tez_fsstat.q,mapjoin_decimal.q,tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q,tez_union.q,bucket_map_join_tez1.q,bucket_map_join_tez2.q,tez_schema_evolution.q,tez_join_hash.q
 minitez.query.files.shared=cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,metadataonly1.q

Added: hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q?rev=1609025&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q Wed Jul 9 06:49:00 2014
@@ -0,0 +1,10 @@
+dfs ${system:test.dfs.mkdir} hdfs:///target/tmp/test_empty_table;
+
+create external table roottable (key string) row format delimited fields terminated by '\\t' stored as textfile location 'hdfs:///target/tmp/test_empty_table';
+select count(*) from roottable;
+
+insert into table roottable select key from src where (key < 20) order by key;
+select count(*) from roottable;
+
+dfs ${system:test.dfs.mkdir} hdfs:///target/tmp/test_empty_table/empty;
+select count(*) from roottable;
\ No newline at end of file
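The new qfile covers the regression end to end: scan an external table over an empty location (expect 0 rows), load twenty rows (expect 20), then create an empty subdirectory under the table location and confirm the count is still 20; the golden file that follows records exactly those counts. For reference, a rough equivalent of the directory setup via the Hadoop FileSystem API; the class name is hypothetical and the paths simply mirror the test (the ${system:test.dfs.mkdir} macro lets the harness substitute mkdir flags appropriate to the Hadoop version under test).

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Mirrors what the qfile's dfs commands do: create the table location,
    // then add the empty subdirectory that used to break reads.
    public class TestLayoutSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path tableDir = new Path("/target/tmp/test_empty_table"); // table location in the test
        fs.mkdirs(tableDir);
        // ... the INSERT in the test writes data files under tableDir ...
        fs.mkdirs(new Path(tableDir, "empty")); // a table scan must ignore this
      }
    }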
Added: hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out?rev=1609025&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out Wed Jul 9 06:49:00 2014
@@ -0,0 +1,45 @@
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@roottable
+PREHOOK: query: select count(*) from roottable
+PREHOOK: type: QUERY
+PREHOOK: Input: default@roottable
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from roottable
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@roottable
+#### A masked pattern was here ####
+0
+PREHOOK: query: insert into table roottable select key from src where (key < 20) order by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@roottable
+POSTHOOK: query: insert into table roottable select key from src where (key < 20) order by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@roottable
+POSTHOOK: Lineage: roottable.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: select count(*) from roottable
+PREHOOK: type: QUERY
+PREHOOK: Input: default@roottable
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from roottable
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@roottable
+#### A masked pattern was here ####
+20
+PREHOOK: query: select count(*) from roottable
+PREHOOK: type: QUERY
+PREHOOK: Input: default@roottable
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from roottable
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@roottable
+#### A masked pattern was here ####
+20
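The shim change that follows does the actual work. The InputSplitShim copy constructor, which wrapped a CombineFileSplit wholesale, is replaced by a constructor taking the split's fields explicitly so that getSplits() can pass pruned path/offset/length arrays, and the dedup() helper for locality hosts moves from the inner split class to the outer shim class. A standalone sketch of dedup()'s behavior; the class here is illustrative, mirroring the helper in the patch:

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.HashSet;
    import java.util.Set;

    // Sketch of the dedup helper: collapse repeated locality hints.
    // HashSet iteration order is unspecified, so the output order may vary.
    public class DedupSketch {
      static String[] dedup(String[] locations) {
        Set<String> dedup = new HashSet<String>();
        Collections.addAll(dedup, locations);
        return dedup.toArray(new String[dedup.size()]);
      }

      public static void main(String[] args) {
        String[] hosts = {"node1", "node2", "node1", "node3", "node2"};
        System.out.println(Arrays.toString(dedup(hosts))); // e.g. [node1, node2, node3]
      }
    }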
Modified: hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java
URL: http://svn.apache.org/viewvc/hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java?rev=1609025&r1=1609024&r2=1609025&view=diff
==============================================================================
--- hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java (original)
+++ hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java Wed Jul 9 06:49:00 2014
@@ -26,6 +26,7 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.security.PrivilegedExceptionAction;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
@@ -67,6 +68,8 @@ import org.apache.hadoop.tools.HadoopArc
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.ToolRunner;
 
+import com.google.common.primitives.Longs;
+
 /**
  * Base implementation for shims against secure Hadoop 0.20.3/0.23.
  */
@@ -97,18 +100,12 @@ public abstract class HadoopShimsSecure
       _isShrinked = false;
     }
 
-    public InputSplitShim(CombineFileSplit old) throws IOException {
-      super(old.getJob(), old.getPaths(), old.getStartOffsets(),
-          old.getLengths(), dedup(old.getLocations()));
+    public InputSplitShim(JobConf conf, Path[] paths, long[] startOffsets,
+        long[] lengths, String[] locations) throws IOException {
+      super(conf, paths, startOffsets, lengths, dedup(locations));
       _isShrinked = false;
     }
 
-    private static String[] dedup(String[] locations) {
-      Set<String> dedup = new HashSet<String>();
-      Collections.addAll(dedup, locations);
-      return dedup.toArray(new String[dedup.size()]);
-    }
-
     @Override
     public void shrinkSplit(long length) {
       _isShrinked = true;
@@ -338,12 +335,22 @@ public abstract class HadoopShimsSecure
       InputSplit[] splits = (InputSplit[]) super.getSplits(job, numSplits);
 
-      InputSplitShim[] isplits = new InputSplitShim[splits.length];
+      ArrayList<InputSplitShim> inputSplitShims = new ArrayList<InputSplitShim>();
       for (int pos = 0; pos < splits.length; pos++) {
-        isplits[pos] = new InputSplitShim((CombineFileSplit)splits[pos]);
+        CombineFileSplit split = (CombineFileSplit) splits[pos];
+        Set<Integer> dirIndices = getDirIndices(split.getPaths(), job);
+        // Keep the split only if at least one of its paths is a regular file,
+        // rebuilding it with the directory entries pruned out.
+        if (dirIndices.size() != split.getPaths().length) {
+          List<Path> prunedPaths = prune(dirIndices, Arrays.asList(split.getPaths()));
+          List<Long> prunedStartOffsets = prune(dirIndices, Arrays.asList(
+              ArrayUtils.toObject(split.getStartOffsets())));
+          List<Long> prunedLengths = prune(dirIndices, Arrays.asList(
+              ArrayUtils.toObject(split.getLengths())));
+          inputSplitShims.add(new InputSplitShim(job, prunedPaths.toArray(new Path[prunedPaths.size()]),
+              Longs.toArray(prunedStartOffsets),
+              Longs.toArray(prunedLengths), split.getLocations()));
+        }
       }
-
-      return isplits;
+      return inputSplitShims.toArray(new InputSplitShim[inputSplitShims.size()]);
     }
 
     public InputSplitShim getInputSplitShim() throws IOException {
@@ -623,4 +630,37 @@ public abstract class HadoopShimsSecure
     int retval = shell.run(command);
     LOG.debug("Return value is :" + retval);
   }
+
+  /**
+   * CombineFileInputFormat sometimes returns directories among a split's paths; prune them out.
+   */
+  private static Set<Integer> getDirIndices(Path[] paths, JobConf conf) throws IOException {
+    Set<Integer> result = new HashSet<Integer>();
+    for (int i = 0; i < paths.length; i++) {
+      FileSystem fs = paths[i].getFileSystem(conf);
+      if (!fs.isFile(paths[i])) {
+        result.add(i);
+      }
+    }
+    return result;
+  }
+
+  private static <K> List<K> prune(Set<Integer> indicesToPrune, List<K> elms) {
+    List<K> result = new ArrayList<K>();
+    int i = 0;
+    for (K elm : elms) {
+      // The index must advance for every element, pruned or not,
+      // so later positions stay aligned with indicesToPrune.
+      if (!indicesToPrune.contains(i++)) {
+        result.add(elm);
+      }
+    }
+    return result;
+  }
+
+  private static String[] dedup(String[] locations) throws IOException {
+    Set<String> dedup = new HashSet<String>();
+    Collections.addAll(dedup, locations);
+    return dedup.toArray(new String[dedup.size()]);
+  }
 }
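To make the pruning concrete: getSplits() asks getDirIndices() which positions in a split's path array are directories, then prune() drops those positions from each of the parallel arrays (paths, start offsets, lengths). A toy walk-through with made-up values; prune() here mirrors the patched helper:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Given split paths [fileA, dirB, fileC], getDirIndices() would return {1},
    // and prune() removes index 1 from each parallel list.
    public class PruneSketch {
      static <K> List<K> prune(Set<Integer> indicesToPrune, List<K> elms) {
        List<K> result = new ArrayList<K>();
        int i = 0;
        for (K elm : elms) {
          if (!indicesToPrune.contains(i++)) { // index advances for every element
            result.add(elm);
          }
        }
        return result;
      }

      public static void main(String[] args) {
        Set<Integer> dirIndices = new HashSet<Integer>(Arrays.asList(1));
        System.out.println(prune(dirIndices, Arrays.asList("fileA", "dirB", "fileC"))); // [fileA, fileC]
        System.out.println(prune(dirIndices, Arrays.asList(10L, 0L, 20L)));             // [10, 20]
      }
    }

Note that the index has to advance even for pruned elements; skipping the increment on a pruned position would shift every later index and drop the wrong entries, which is why prune() increments inside the contains() check.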