Author: navis
Date: Wed Jul 9 06:49:00 2014
New Revision: 1609025

URL: http://svn.apache.org/r1609025
Log:
HIVE-7220 : Empty dir in external table causes issue (Szehon Ho via Navis)
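Background for the change: CombineFileInputFormat can hand back a directory (for example, an empty subdirectory under an external table's location) among a split's paths, and the record reader then fails when it tries to open that path as a file. Below is a minimal sketch of the failure mode, assuming a reachable HDFS; the class name and paths are illustrative and not part of this patch.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Illustrative only: a record reader effectively calls fs.open(splitPath)
    // for each path in a split; opening a directory throws on HDFS.
    public class EmptyDirFailureSketch {
      public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        Path emptyDir = new Path("/tmp/test_empty_table/empty"); // hypothetical path
        fs.mkdirs(emptyDir);
        try {
          fs.open(emptyDir).close(); // a scan over the table would die here before this fix
        } catch (IOException expected) {
          System.out.println("Directory in split fails: " + expected.getMessage());
        }
      }
    }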
Added:
    hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q
    hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out
Modified:
    hive/trunk/itests/qtest/testconfiguration.properties
    hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java

Modified: hive/trunk/itests/qtest/testconfiguration.properties
URL: http://svn.apache.org/viewvc/hive/trunk/itests/qtest/testconfiguration.properties?rev=1609025&r1=1609024&r2=1609025&view=diff
==============================================================================
--- hive/trunk/itests/qtest/testconfiguration.properties (original)
+++ hive/trunk/itests/qtest/testconfiguration.properties Wed Jul 9 06:49:00 2014
@@ -1,4 +1,4 @@
-minimr.query.files=stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q
+minimr.query.files=stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q,empty_dir_in_table.q
 minimr.query.negative.files=cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q,udf_local_resource.q
 minitez.query.files=tez_fsstat.q,mapjoin_decimal.q,tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q,tez_union.q,bucket_map_join_tez1.q,bucket_map_join_tez2.q,tez_schema_evolution.q,tez_join_hash.q
 minitez.query.files.shared=cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,metadataonly1.q

Added: hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q?rev=1609025&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/empty_dir_in_table.q Wed Jul 9 06:49:00 2014
@@ -0,0 +1,10 @@
+dfs ${system:test.dfs.mkdir} hdfs:///target/tmp/test_empty_table;
+
+create external table roottable (key string) row format delimited fields terminated by '\\t' stored as textfile location 'hdfs:///target/tmp/test_empty_table';
+select count(*) from roottable;
+
+insert into table roottable select key from src where (key < 20) order by key;
+select count(*) from roottable;
+
+dfs ${system:test.dfs.mkdir} hdfs:///target/tmp/test_empty_table/empty;
+select count(*) from roottable;
\ No newline at end of file
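The new qfile covers the regression end to end: scan an external table over an empty location (expect 0 rows), load twenty rows (expect 20), then create an empty subdirectory under the table location and confirm the count is still 20; the golden file that follows records exactly those counts. For reference, a rough equivalent of the directory setup via the Hadoop FileSystem API; the class name is hypothetical and the paths simply mirror the test (the ${system:test.dfs.mkdir} macro lets the harness substitute mkdir flags appropriate to the Hadoop version under test).

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    // Mirrors what the qfile's dfs commands do: create the table location,
    // then add the empty subdirectory that used to break reads.
    public class TestLayoutSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path tableDir = new Path("/target/tmp/test_empty_table"); // table location in the test
        fs.mkdirs(tableDir);
        // ... the INSERT in the test writes data files under tableDir ...
        fs.mkdirs(new Path(tableDir, "empty")); // a table scan must ignore this
      }
    }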
Added: hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out?rev=1609025&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/empty_dir_in_table.q.out Wed Jul 9 06:49:00 2014
@@ -0,0 +1,45 @@
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@roottable
+PREHOOK: query: select count(*) from roottable
+PREHOOK: type: QUERY
+PREHOOK: Input: default@roottable
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from roottable
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@roottable
+#### A masked pattern was here ####
+0
+PREHOOK: query: insert into table roottable select key from src where (key < 20) order by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@roottable
+POSTHOOK: query: insert into table roottable select key from src where (key < 20) order by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@roottable
+POSTHOOK: Lineage: roottable.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: select count(*) from roottable
+PREHOOK: type: QUERY
+PREHOOK: Input: default@roottable
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from roottable
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@roottable
+#### A masked pattern was here ####
+20
+PREHOOK: query: select count(*) from roottable
+PREHOOK: type: QUERY
+PREHOOK: Input: default@roottable
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from roottable
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@roottable
+#### A masked pattern was here ####
+20
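The shim change that follows does the actual work. The InputSplitShim copy constructor, which wrapped a CombineFileSplit wholesale, is replaced by a constructor taking the split's fields explicitly so that getSplits() can pass pruned path/offset/length arrays, and the dedup() helper for locality hosts moves from the inner split class to the outer shim class. A standalone sketch of dedup()'s behavior; the class here is illustrative, mirroring the helper in the patch:

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.HashSet;
    import java.util.Set;

    // Sketch of the dedup helper: collapse repeated locality hints.
    // HashSet iteration order is unspecified, so the output order may vary.
    public class DedupSketch {
      static String[] dedup(String[] locations) {
        Set<String> dedup = new HashSet<String>();
        Collections.addAll(dedup, locations);
        return dedup.toArray(new String[dedup.size()]);
      }

      public static void main(String[] args) {
        String[] hosts = {"node1", "node2", "node1", "node3", "node2"};
        System.out.println(Arrays.toString(dedup(hosts))); // e.g. [node1, node2, node3]
      }
    }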
Modified: hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java
URL: http://svn.apache.org/viewvc/hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java?rev=1609025&r1=1609024&r2=1609025&view=diff
==============================================================================
--- hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java (original)
+++ hive/trunk/shims/common-secure/src/main/java/org/apache/hadoop/hive/shims/HadoopShimsSecure.java Wed Jul 9 06:49:00 2014
@@ -26,6 +26,7 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.security.PrivilegedExceptionAction;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
@@ -67,6 +68,8 @@ import org.apache.hadoop.tools.HadoopArc
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.ToolRunner;
 
+import com.google.common.primitives.Longs;
+
 /**
  * Base implementation for shims against secure Hadoop 0.20.3/0.23.
  */
@@ -97,18 +100,12 @@ public abstract class HadoopShimsSecure
       _isShrinked = false;
     }
 
-    public InputSplitShim(CombineFileSplit old) throws IOException {
-      super(old.getJob(), old.getPaths(), old.getStartOffsets(),
-          old.getLengths(), dedup(old.getLocations()));
+    public InputSplitShim(JobConf conf, Path[] paths, long[] startOffsets,
+        long[] lengths, String[] locations) throws IOException {
+      super(conf, paths, startOffsets, lengths, dedup(locations));
       _isShrinked = false;
     }
 
-    private static String[] dedup(String[] locations) {
-      Set<String> dedup = new HashSet<String>();
-      Collections.addAll(dedup, locations);
-      return dedup.toArray(new String[dedup.size()]);
-    }
-
     @Override
     public void shrinkSplit(long length) {
       _isShrinked = true;
@@ -338,12 +335,22 @@ public abstract class HadoopShimsSecure
       InputSplit[] splits = (InputSplit[]) super.getSplits(job, numSplits);
 
-      InputSplitShim[] isplits = new InputSplitShim[splits.length];
+      ArrayList<InputSplitShim> inputSplitShims = new ArrayList<InputSplitShim>();
       for (int pos = 0; pos < splits.length; pos++) {
-        isplits[pos] = new InputSplitShim((CombineFileSplit)splits[pos]);
+        CombineFileSplit split = (CombineFileSplit) splits[pos];
+        Set<Integer> dirIndices = getDirIndices(split.getPaths(), job);
+        // Keep the split only if at least one of its paths is a regular file,
+        // rebuilding it with the directory entries pruned out.
+        if (dirIndices.size() != split.getPaths().length) {
+          List<Path> prunedPaths = prune(dirIndices, Arrays.asList(split.getPaths()));
+          List<Long> prunedStartOffsets = prune(dirIndices, Arrays.asList(
+              ArrayUtils.toObject(split.getStartOffsets())));
+          List<Long> prunedLengths = prune(dirIndices, Arrays.asList(
+              ArrayUtils.toObject(split.getLengths())));
+          inputSplitShims.add(new InputSplitShim(job, prunedPaths.toArray(new Path[prunedPaths.size()]),
+              Longs.toArray(prunedStartOffsets),
+              Longs.toArray(prunedLengths), split.getLocations()));
+        }
       }
-
-      return isplits;
+      return inputSplitShims.toArray(new InputSplitShim[inputSplitShims.size()]);
     }
 
     public InputSplitShim getInputSplitShim() throws IOException {
@@ -623,4 +630,37 @@ public abstract class HadoopShimsSecure
     int retval = shell.run(command);
     LOG.debug("Return value is :" + retval);
   }
+
+  /**
+   * CombineFileInputFormat sometimes returns directories among a split's paths; prune them out.
+   */
+  private static Set<Integer> getDirIndices(Path[] paths, JobConf conf) throws IOException {
+    Set<Integer> result = new HashSet<Integer>();
+    for (int i = 0; i < paths.length; i++) {
+      FileSystem fs = paths[i].getFileSystem(conf);
+      if (!fs.isFile(paths[i])) {
+        result.add(i);
+      }
+    }
+    return result;
+  }
+
+  private static <K> List<K> prune(Set<Integer> indicesToPrune, List<K> elms) {
+    List<K> result = new ArrayList<K>();
+    int i = 0;
+    for (K elm : elms) {
+      // The index must advance for every element, pruned or not,
+      // so later positions stay aligned with indicesToPrune.
+      if (!indicesToPrune.contains(i++)) {
+        result.add(elm);
+      }
+    }
+    return result;
+  }
+
+  private static String[] dedup(String[] locations) throws IOException {
+    Set<String> dedup = new HashSet<String>();
+    Collections.addAll(dedup, locations);
+    return dedup.toArray(new String[dedup.size()]);
+  }
 }
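To make the pruning concrete: getSplits() asks getDirIndices() which positions in a split's path array are directories, then prune() drops those positions from each of the parallel arrays (paths, start offsets, lengths). A toy walk-through with made-up values; prune() here mirrors the patched helper:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // Given split paths [fileA, dirB, fileC], getDirIndices() would return {1},
    // and prune() removes index 1 from each parallel list.
    public class PruneSketch {
      static <K> List<K> prune(Set<Integer> indicesToPrune, List<K> elms) {
        List<K> result = new ArrayList<K>();
        int i = 0;
        for (K elm : elms) {
          if (!indicesToPrune.contains(i++)) { // index advances for every element
            result.add(elm);
          }
        }
        return result;
      }

      public static void main(String[] args) {
        Set<Integer> dirIndices = new HashSet<Integer>(Arrays.asList(1));
        System.out.println(prune(dirIndices, Arrays.asList("fileA", "dirB", "fileC"))); // [fileA, fileC]
        System.out.println(prune(dirIndices, Arrays.asList(10L, 0L, 20L)));             // [10, 20]
      }
    }

Note that the index has to advance even for pruned elements; skipping the increment on a pruned position would shift every later index and drop the wrong entries, which is why prune() increments inside the contains() check.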