This is an automated email from the ASF dual-hosted git repository.

abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new c5d95776237 HIVE-27498: Support custom delimiter in SkippingTextInputFormat (#5265) (Mayank Kunwar reviewed by Laszlo Bodor)
c5d95776237 is described below

commit c5d9577623747e848a7e9e00b208be6d4aa791b5
Author: Mayank Kunwar <[email protected]>
AuthorDate: Thu Jun 6 13:27:43 2024 +0530

    HIVE-27498: Support custom delimiter in SkippingTextInputFormat (#5265) (Mayank Kunwar reviewed by Laszlo Bodor)
---
 data/files/header_footer_table_4/0003.txt          |  2 +
 .../hadoop/hive/ql/io/SkippingTextInputFormat.java | 19 ++++++-
 .../queries/clientpositive/file_with_delimiter.q   | 22 ++++++++
 .../clientpositive/llap/file_with_delimiter.q.out  | 63 ++++++++++++++++++++++
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/data/files/header_footer_table_4/0003.txt b/data/files/header_footer_table_4/0003.txt
new file mode 100644
index 00000000000..e22f4ae6923
--- /dev/null
+++ b/data/files/header_footer_table_4/0003.txt
@@ -0,0 +1,2 @@
+Code   Name|A  AAAA|B  BBBB
+CCCC|C DDDD
\ No newline at end of file
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java
index 7427aeab9a7..45634a098d6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java
@@ -110,17 +110,32 @@ public class SkippingTextInputFormat extends TextInputFormat {
       fileSystem = path.getFileSystem(conf);
       try {
         fis = fileSystem.open(path);
+        long currPos = fis.getPos();
+        int delimiterIdx = -1;
         for (int j = 0; j < headerCount; j++) {
-          if (fis.readLine() == null) {
+          String headerLine = fis.readLine();
+          if (headerLine == null) {
             startIndexMap.put(path, Long.MAX_VALUE);
             return Long.MAX_VALUE;
           }
+          if (j == headerCount-1) {
+            String delimiter = conf.get("textinputformat.record.delimiter");
+            // If record delimiter is defined
+            if (delimiter != null && !delimiter.isEmpty()) {
+              delimiterIdx = headerLine.indexOf(delimiter);
+            } else {
+              currPos = fis.getPos();
+            }
+          } else {
+            currPos = fis.getPos();
+          }
         }
         // Readers skip the entire first row if the start index of the
         // split is not zero. We are setting the start of the index as
         // the last byte of the previous row so the last line of header
         // is discarded instead of the first valid input row.
-        startIndexForFile = fis.getPos() - 1;
+        // We consider record delimiters if they exist.
+        startIndexForFile = currPos + delimiterIdx;
       } finally {
         if (fis != null) {
           fis.close();
diff --git a/ql/src/test/queries/clientpositive/file_with_delimiter.q b/ql/src/test/queries/clientpositive/file_with_delimiter.q
new file mode 100644
index 00000000000..bb90df1e5c5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/file_with_delimiter.q
@@ -0,0 +1,22 @@
+CREATE EXTERNAL TABLE test(code string,name string)
+ROW FORMAT SERDE
+   'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ WITH SERDEPROPERTIES (
+   'field.delim'='\t')
+ STORED AS INPUTFORMAT
+   'org.apache.hadoop.mapred.TextInputFormat'
+ OUTPUTFORMAT
+   'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+   location '${system:test.tmp.dir}/test'
+ TBLPROPERTIES (
+   'skip.header.line.count'='1',
+   'textinputformat.record.delimiter'='|');
+
+
+LOAD DATA LOCAL INPATH '../../data/files/header_footer_table_4/0003.txt' INTO TABLE test;
+
+
+SELECT COUNT(*) FROM test;
+
+
+SELECT * FROM test;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/file_with_delimiter.q.out b/ql/src/test/results/clientpositive/llap/file_with_delimiter.q.out
new file mode 100644
index 00000000000..c213767e552
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/file_with_delimiter.q.out
@@ -0,0 +1,63 @@
+PREHOOK: query: CREATE EXTERNAL TABLE test(code string,name string)
+ROW FORMAT SERDE
+   'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ WITH SERDEPROPERTIES (
+   'field.delim'='\t')
+ STORED AS INPUTFORMAT
+   'org.apache.hadoop.mapred.TextInputFormat'
+ OUTPUTFORMAT
+   'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+#### A masked pattern was here ####
+ TBLPROPERTIES (
+   'skip.header.line.count'='1',
+   'textinputformat.record.delimiter'='|')
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: CREATE EXTERNAL TABLE test(code string,name string)
+ROW FORMAT SERDE
+   'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ WITH SERDEPROPERTIES (
+   'field.delim'='\t')
+ STORED AS INPUTFORMAT
+   'org.apache.hadoop.mapred.TextInputFormat'
+ OUTPUTFORMAT
+   'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+#### A masked pattern was here ####
+ TBLPROPERTIES (
+   'skip.header.line.count'='1',
+   'textinputformat.record.delimiter'='|')
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/header_footer_table_4/0003.txt' INTO TABLE test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@test
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/header_footer_table_4/0003.txt' INTO TABLE test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@test
+PREHOOK: query: SELECT COUNT(*) FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+3
+PREHOOK: query: SELECT * FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+A      AAAA
+B      BBBB
+CCCC
+C      DDDD

Reply via email to