Repository: hive
Updated Branches:
  refs/heads/master cdcc35e61 -> 4df9b4d20


HIVE-12541: SymbolicTextInputFormat should support the path with regex 
(Xiaowei Wang, reviewed by Aihua Xu)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4df9b4d2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4df9b4d2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4df9b4d2

Branch: refs/heads/master
Commit: 4df9b4d208087dd659309cda7d4627d000b2f6ba
Parents: cdcc35e
Author: Aihua Xu <aihu...@apache.org>
Authored: Tue Dec 22 09:37:41 2015 -0500
Committer: Aihua Xu <aihu...@apache.org>
Committed: Tue Dec 22 09:37:41 2015 -0500

----------------------------------------------------------------------
 data/files/regex-path-2015-12-10_03.txt         |   1 +
 data/files/regex-path-201512-10_03.txt          |   1 +
 data/files/regex-path-2015121003.txt            |   1 +
 data/files/symlink-with-regex.txt               |   2 +
 .../hadoop/hive/ql/io/SymbolicInputFormat.java  |   7 +-
 .../clientpositive/symlink_text_input_format.q  |  26 +++
 .../symlink_text_input_format.q.out             | 218 +++++++++++++++++++
 7 files changed, 254 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-2015-12-10_03.txt
----------------------------------------------------------------------
diff --git a/data/files/regex-path-2015-12-10_03.txt 
b/data/files/regex-path-2015-12-10_03.txt
new file mode 100644
index 0000000..315e406
--- /dev/null
+++ b/data/files/regex-path-2015-12-10_03.txt
@@ -0,0 +1 @@
+101101

http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-201512-10_03.txt
----------------------------------------------------------------------
diff --git a/data/files/regex-path-201512-10_03.txt 
b/data/files/regex-path-201512-10_03.txt
new file mode 100644
index 0000000..e2bdf39
--- /dev/null
+++ b/data/files/regex-path-201512-10_03.txt
@@ -0,0 +1 @@
+102102

http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-2015121003.txt
----------------------------------------------------------------------
diff --git a/data/files/regex-path-2015121003.txt 
b/data/files/regex-path-2015121003.txt
new file mode 100644
index 0000000..74a4ca1
--- /dev/null
+++ b/data/files/regex-path-2015121003.txt
@@ -0,0 +1 @@
+103103

http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/symlink-with-regex.txt
----------------------------------------------------------------------
diff --git a/data/files/symlink-with-regex.txt 
b/data/files/symlink-with-regex.txt
new file mode 100644
index 0000000..21e119e
--- /dev/null
+++ b/data/files/symlink-with-regex.txt
@@ -0,0 +1,2 @@
+../../data/files/*2015{-,}12{-,}10{_03,03}*.txt
+../../data/files/T{1,3}.txt

http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
index feef854..8b49204 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
@@ -75,8 +75,11 @@ public class SymbolicInputFormat implements 
ReworkMapredInputFormat {
             while ((line = reader.readLine()) != null) {
               // no check for the line? How to check?
               // if the line is invalid for any reason, the job will fail.
-              toAddPathToPart.put(line, partDesc);
-              pathToAliases.put(line, aliases);
+              FileStatus[] matches = fileSystem.globStatus(new Path(line));
+              for(FileStatus fileStatus :matches) {
+                 toAddPathToPart.put(fileStatus.getPath().toUri().getPath(), 
partDesc);
+                 pathToAliases.put(fileStatus.getPath().toUri().getPath(), 
aliases);
+              }
             }
           } finally {
             org.apache.hadoop.io.IOUtils.closeStream(reader);

http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/test/queries/clientpositive/symlink_text_input_format.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/symlink_text_input_format.q 
b/ql/src/test/queries/clientpositive/symlink_text_input_format.q
index 521a617..d89aad4 100644
--- a/ql/src/test/queries/clientpositive/symlink_text_input_format.q
+++ b/ql/src/test/queries/clientpositive/symlink_text_input_format.q
@@ -22,3 +22,29 @@ EXPLAIN SELECT count(1) FROM symlink_text_input_format;
 SELECT count(1) FROM symlink_text_input_format;
 
 DROP TABLE symlink_text_input_format;
+
+CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS 
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 
'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
+
+dfs -cp ../../data/files/symlink-with-regex.txt 
${system:test.warehouse.dir}/symlink_text_input_format/symlink-with-regex.txt;
+dfs -cp ../../data/files/symlink2.txt 
${system:test.warehouse.dir}/symlink_text_input_format/symlink2.txt;
+
+EXPLAIN SELECT * FROM symlink_text_input_format order by key, value;
+
+SELECT * FROM symlink_text_input_format order by key, value;
+
+EXPLAIN SELECT value FROM symlink_text_input_format order by value;
+
+SELECT value FROM symlink_text_input_format order by value;
+
+EXPLAIN SELECT count(1) FROM symlink_text_input_format;
+
+SELECT count(1) FROM symlink_text_input_format;
+
+SET hive.rework.mapredwork = true ;
+SET mapred.max.split.size= 0 ;
+SET mapred.min.split.size.per.node= 0 ;
+SET mapred.min.split.size.per.rack= 0 ;
+
+SELECT count(1) FROM symlink_text_input_format;
+
+DROP TABLE symlink_text_input_format;

http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/symlink_text_input_format.q.out 
b/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
index 6c2e2e6..6a091e2 100644
--- a/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
+++ b/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
@@ -223,3 +223,221 @@ POSTHOOK: query: DROP TABLE symlink_text_input_format
 POSTHOOK: type: DROPTABLE
 POSTHOOK: Input: default@symlink_text_input_format
 POSTHOOK: Output: default@symlink_text_input_format
+PREHOOK: query: CREATE TABLE symlink_text_input_format (key STRING, value 
STRING) STORED AS INPUTFORMAT 
'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 
'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@symlink_text_input_format
+POSTHOOK: query: CREATE TABLE symlink_text_input_format (key STRING, value 
STRING) STORED AS INPUTFORMAT 
'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 
'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@symlink_text_input_format
+PREHOOK: query: EXPLAIN SELECT * FROM symlink_text_input_format order by key, 
value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM symlink_text_input_format order by key, 
value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: symlink_text_input_format
+            Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+            Select Operator
+              expressions: key (type: string), value (type: string)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+              Reduce Output Operator
+                key expressions: _col0 (type: string), _col1 (type: string)
+                sort order: ++
+                Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 
(type: string)
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column 
stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT * FROM symlink_text_input_format order by key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM symlink_text_input_format order by key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+1      11
+101    101
+102    102
+103    103
+2      12
+2      12
+2      22
+3      13
+3      13
+4      14
+4      14
+5      15
+6      16
+7      17
+7      17
+8      18
+8      18
+8      18
+8      28
+PREHOOK: query: EXPLAIN SELECT value FROM symlink_text_input_format order by 
value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT value FROM symlink_text_input_format order by 
value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: symlink_text_input_format
+            Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+            Select Operator
+              expressions: value (type: string)
+              outputColumnNames: _col0
+              Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+              Reduce Output Operator
+                key expressions: _col0 (type: string)
+                sort order: +
+                Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+      Reduce Operator Tree:
+        Select Operator
+          expressions: KEY.reducesinkkey0 (type: string)
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column 
stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT value FROM symlink_text_input_format order by value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT value FROM symlink_text_input_format order by value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+101
+102
+103
+11
+12
+12
+13
+13
+14
+14
+15
+16
+17
+17
+18
+18
+18
+22
+28
+PREHOOK: query: EXPLAIN SELECT count(1) FROM symlink_text_input_format
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT count(1) FROM symlink_text_input_format
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: symlink_text_input_format
+            Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE 
Column stats: COMPLETE
+              Group By Operator
+                aggregations: count(1)
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column 
stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column 
stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT count(1) FROM symlink_text_input_format
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM symlink_text_input_format
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+19
+PREHOOK: query: SELECT count(1) FROM symlink_text_input_format
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM symlink_text_input_format
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+19
+PREHOOK: query: DROP TABLE symlink_text_input_format
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@symlink_text_input_format
+PREHOOK: Output: default@symlink_text_input_format
+POSTHOOK: query: DROP TABLE symlink_text_input_format
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@symlink_text_input_format
+POSTHOOK: Output: default@symlink_text_input_format

Reply via email to