This is an automated email from the ASF dual-hosted git repository.
abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new c5d95776237 HIVE-27498: Support custom delimiter in
SkippingTextInputFormat (#5265) (Mayank Kunwar reviewed by Laszlo Bodor)
c5d95776237 is described below
commit c5d9577623747e848a7e9e00b208be6d4aa791b5
Author: Mayank Kunwar <[email protected]>
AuthorDate: Thu Jun 6 13:27:43 2024 +0530
HIVE-27498: Support custom delimiter in SkippingTextInputFormat (#5265)
(Mayank Kunwar reviewed by Laszlo Bodor)
---
data/files/header_footer_table_4/0003.txt | 2 +
.../hadoop/hive/ql/io/SkippingTextInputFormat.java | 19 ++++++-
.../queries/clientpositive/file_with_delimiter.q | 22 ++++++++
.../clientpositive/llap/file_with_delimiter.q.out | 63 ++++++++++++++++++++++
4 files changed, 104 insertions(+), 2 deletions(-)
diff --git a/data/files/header_footer_table_4/0003.txt b/data/files/header_footer_table_4/0003.txt
new file mode 100644
index 00000000000..e22f4ae6923
--- /dev/null
+++ b/data/files/header_footer_table_4/0003.txt
@@ -0,0 +1,2 @@
+Code Name|A AAAA|B BBBB
+CCCC|C DDDD
\ No newline at end of file
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java
index 7427aeab9a7..45634a098d6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SkippingTextInputFormat.java
@@ -110,17 +110,32 @@ public class SkippingTextInputFormat extends TextInputFormat {
fileSystem = path.getFileSystem(conf);
try {
fis = fileSystem.open(path);
+ long currPos = fis.getPos();
+ int delimiterIdx = -1;
for (int j = 0; j < headerCount; j++) {
- if (fis.readLine() == null) {
+ String headerLine = fis.readLine();
+ if (headerLine == null) {
startIndexMap.put(path, Long.MAX_VALUE);
return Long.MAX_VALUE;
}
+ if (j == headerCount-1) {
+ String delimiter = conf.get("textinputformat.record.delimiter");
+ // If record delimiter is defined
+ if (delimiter != null && !delimiter.isEmpty()) {
+ delimiterIdx = headerLine.indexOf(delimiter);
+ } else {
+ currPos = fis.getPos();
+ }
+ } else {
+ currPos = fis.getPos();
+ }
}
// Readers skip the entire first row if the start index of the
// split is not zero. We are setting the start of the index as
// the last byte of the previous row so the last line of header
// is discarded instead of the first valid input row.
- startIndexForFile = fis.getPos() - 1;
+ // We consider record delimiters if they exist.
+ startIndexForFile = currPos + delimiterIdx;
} finally {
if (fis != null) {
fis.close();
diff --git a/ql/src/test/queries/clientpositive/file_with_delimiter.q b/ql/src/test/queries/clientpositive/file_with_delimiter.q
new file mode 100644
index 00000000000..bb90df1e5c5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/file_with_delimiter.q
@@ -0,0 +1,22 @@
+CREATE EXTERNAL TABLE test(code string,name string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ WITH SERDEPROPERTIES (
+ 'field.delim'='\t')
+ STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+ OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+ location '${system:test.tmp.dir}/test'
+ TBLPROPERTIES (
+ 'skip.header.line.count'='1',
+ 'textinputformat.record.delimiter'='|');
+
+
+LOAD DATA LOCAL INPATH '../../data/files/header_footer_table_4/0003.txt' INTO TABLE test;
+
+
+SELECT COUNT(*) FROM test;
+
+
+SELECT * FROM test;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/file_with_delimiter.q.out b/ql/src/test/results/clientpositive/llap/file_with_delimiter.q.out
new file mode 100644
index 00000000000..c213767e552
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/file_with_delimiter.q.out
@@ -0,0 +1,63 @@
+PREHOOK: query: CREATE EXTERNAL TABLE test(code string,name string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ WITH SERDEPROPERTIES (
+ 'field.delim'='\t')
+ STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+ OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+#### A masked pattern was here ####
+ TBLPROPERTIES (
+ 'skip.header.line.count'='1',
+ 'textinputformat.record.delimiter'='|')
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: CREATE EXTERNAL TABLE test(code string,name string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ WITH SERDEPROPERTIES (
+ 'field.delim'='\t')
+ STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+ OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+#### A masked pattern was here ####
+ TBLPROPERTIES (
+ 'skip.header.line.count'='1',
+ 'textinputformat.record.delimiter'='|')
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/header_footer_table_4/0003.txt' INTO TABLE test
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@test
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/header_footer_table_4/0003.txt' INTO TABLE test
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@test
+PREHOOK: query: SELECT COUNT(*) FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+3
+PREHOOK: query: SELECT * FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+A AAAA
+B BBBB
+CCCC
+C DDDD