Repository: hive Updated Branches: refs/heads/master cdcc35e61 -> 4df9b4d20
HIVE-12541: SymbolicTextInputFormat should supports the path with regex (Xiaowei Wang, reviewed by Aihua Xu) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4df9b4d2 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4df9b4d2 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4df9b4d2 Branch: refs/heads/master Commit: 4df9b4d208087dd659309cda7d4627d000b2f6ba Parents: cdcc35e Author: Aihua Xu <aihu...@apache.org> Authored: Tue Dec 22 09:37:41 2015 -0500 Committer: Aihua Xu <aihu...@apache.org> Committed: Tue Dec 22 09:37:41 2015 -0500 ---------------------------------------------------------------------- data/files/regex-path-2015-12-10_03.txt | 1 + data/files/regex-path-201512-10_03.txt | 1 + data/files/regex-path-2015121003.txt | 1 + data/files/symlink-with-regex.txt | 2 + .../hadoop/hive/ql/io/SymbolicInputFormat.java | 7 +- .../clientpositive/symlink_text_input_format.q | 26 +++ .../symlink_text_input_format.q.out | 218 +++++++++++++++++++ 7 files changed, 254 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-2015-12-10_03.txt ---------------------------------------------------------------------- diff --git a/data/files/regex-path-2015-12-10_03.txt b/data/files/regex-path-2015-12-10_03.txt new file mode 100644 index 0000000..315e406 --- /dev/null +++ b/data/files/regex-path-2015-12-10_03.txt @@ -0,0 +1 @@ +101101 http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-201512-10_03.txt ---------------------------------------------------------------------- diff --git a/data/files/regex-path-201512-10_03.txt b/data/files/regex-path-201512-10_03.txt new file mode 100644 index 0000000..e2bdf39 --- /dev/null +++ b/data/files/regex-path-201512-10_03.txt @@ -0,0 +1 @@ +102102 http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-2015121003.txt ---------------------------------------------------------------------- diff --git a/data/files/regex-path-2015121003.txt b/data/files/regex-path-2015121003.txt new file mode 100644 index 0000000..74a4ca1 --- /dev/null +++ b/data/files/regex-path-2015121003.txt @@ -0,0 +1 @@ +103103 http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/symlink-with-regex.txt ---------------------------------------------------------------------- diff --git a/data/files/symlink-with-regex.txt b/data/files/symlink-with-regex.txt new file mode 100644 index 0000000..21e119e --- /dev/null +++ b/data/files/symlink-with-regex.txt @@ -0,0 +1,2 @@ +../../data/files/*2015{-,}12{-,}10{_03,03}*.txt +../../data/files/T{1,3}.txt http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java index feef854..8b49204 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java @@ -75,8 +75,11 @@ public class SymbolicInputFormat implements ReworkMapredInputFormat { while ((line = reader.readLine()) != null) { // no check for the line? How to check? // if the line is invalid for any reason, the job will fail. - toAddPathToPart.put(line, partDesc); - pathToAliases.put(line, aliases); + FileStatus[] matches = fileSystem.globStatus(new Path(line)); + for(FileStatus fileStatus :matches) { + toAddPathToPart.put(fileStatus.getPath().toUri().getPath(), partDesc); + pathToAliases.put(fileStatus.getPath().toUri().getPath(), aliases); + } } } finally { org.apache.hadoop.io.IOUtils.closeStream(reader); http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/test/queries/clientpositive/symlink_text_input_format.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/symlink_text_input_format.q b/ql/src/test/queries/clientpositive/symlink_text_input_format.q index 521a617..d89aad4 100644 --- a/ql/src/test/queries/clientpositive/symlink_text_input_format.q +++ b/ql/src/test/queries/clientpositive/symlink_text_input_format.q @@ -22,3 +22,29 @@ EXPLAIN SELECT count(1) FROM symlink_text_input_format; SELECT count(1) FROM symlink_text_input_format; DROP TABLE symlink_text_input_format; + +CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'; + +dfs -cp ../../data/files/symlink-with-regex.txt ${system:test.warehouse.dir}/symlink_text_input_format/symlink-with-regex.txt; +dfs -cp ../../data/files/symlink2.txt ${system:test.warehouse.dir}/symlink_text_input_format/symlink2.txt; + +EXPLAIN SELECT * FROM symlink_text_input_format order by key, value; + +SELECT * FROM symlink_text_input_format order by key, value; + +EXPLAIN SELECT value FROM symlink_text_input_format order by value; + +SELECT value FROM symlink_text_input_format order by value; + +EXPLAIN SELECT count(1) FROM symlink_text_input_format; + +SELECT count(1) FROM symlink_text_input_format; + +SET hive.rework.mapredwork = true ; +SET mapred.max.split.size= 0 ; +SET mapred.min.split.size.per.node= 0 ; +SET mapred.min.split.size.per.rack= 0 ; + +SELECT count(1) FROM symlink_text_input_format; + +DROP TABLE symlink_text_input_format; http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/test/results/clientpositive/symlink_text_input_format.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/symlink_text_input_format.q.out b/ql/src/test/results/clientpositive/symlink_text_input_format.q.out index 6c2e2e6..6a091e2 100644 --- a/ql/src/test/results/clientpositive/symlink_text_input_format.q.out +++ b/ql/src/test/results/clientpositive/symlink_text_input_format.q.out @@ -223,3 +223,221 @@ POSTHOOK: query: DROP TABLE symlink_text_input_format POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@symlink_text_input_format POSTHOOK: Output: default@symlink_text_input_format +PREHOOK: query: CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@symlink_text_input_format +POSTHOOK: query: CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@symlink_text_input_format +PREHOOK: query: EXPLAIN SELECT * FROM symlink_text_input_format order by key, value +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT * FROM symlink_text_input_format order by key, value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: symlink_text_input_format + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT * FROM symlink_text_input_format order by key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM symlink_text_input_format order by key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +1 11 +101 101 +102 102 +103 103 +2 12 +2 12 +2 22 +3 13 +3 13 +4 14 +4 14 +5 15 +6 16 +7 17 +7 17 +8 18 +8 18 +8 18 +8 28 +PREHOOK: query: EXPLAIN SELECT value FROM symlink_text_input_format order by value +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT value FROM symlink_text_input_format order by value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: symlink_text_input_format + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT value FROM symlink_text_input_format order by value +PREHOOK: type: QUERY +PREHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +POSTHOOK: query: SELECT value FROM symlink_text_input_format order by value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +101 +102 +103 +11 +12 +12 +13 +13 +14 +14 +15 +16 +17 +17 +18 +18 +18 +22 +28 +PREHOOK: query: EXPLAIN SELECT count(1) FROM symlink_text_input_format +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT count(1) FROM symlink_text_input_format +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: symlink_text_input_format + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT count(1) FROM symlink_text_input_format +PREHOOK: type: QUERY +PREHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM symlink_text_input_format +POSTHOOK: type: QUERY +POSTHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +19 +PREHOOK: query: SELECT count(1) FROM symlink_text_input_format +PREHOOK: type: QUERY +PREHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM symlink_text_input_format +POSTHOOK: type: QUERY +POSTHOOK: Input: default@symlink_text_input_format +#### A masked pattern was here #### +19 +PREHOOK: query: DROP TABLE symlink_text_input_format +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@symlink_text_input_format +PREHOOK: Output: default@symlink_text_input_format +POSTHOOK: query: DROP TABLE symlink_text_input_format +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@symlink_text_input_format +POSTHOOK: Output: default@symlink_text_input_format