This is an automated email from the ASF dual-hosted git repository.
szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 95e76f0 HIVE-24151: MultiDelimitSerDe shifts data if strings contain non-ASCII characters (Adam Szita, reviewed by Peter Vary)
95e76f0 is described below
commit 95e76f0bd65e9c7f48252d5c4777ca5ab2a66c3b
Author: Adam Szita <[email protected]>
AuthorDate: Thu Sep 17 10:06:46 2020 +0200
HIVE-24151: MultiDelimitSerDe shifts data if strings contain non-ASCII characters (Adam Szita, reviewed by Peter Vary)
---
data/files/t4_multi_delimit.csv | 5 ++
.../queries/clientpositive/serde_multi_delimit.q | 12 ++++-
.../clientpositive/llap/serde_multi_delimit.q.out | 45 ++++++++++++++++++
.../hadoop/hive/serde2/MultiDelimitSerDe.java | 14 +++---
.../apache/hadoop/hive/serde2/lazy/LazyStruct.java | 55 +++++++++++++---------
5 files changed, 101 insertions(+), 30 deletions(-)
diff --git a/data/files/t4_multi_delimit.csv b/data/files/t4_multi_delimit.csv
new file mode 100644
index 0000000..8888d1d
--- /dev/null
+++ b/data/files/t4_multi_delimit.csv
@@ -0,0 +1,5 @@
+Сок^,dsadsa
+ááé^,^,üóüóüóüóüó
+^,^,^,^,
+áűáűáűáű^,^,^,^,
+űűű^,ááá^,óóó
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/serde_multi_delimit.q
b/ql/src/test/queries/clientpositive/serde_multi_delimit.q
index 0d85175..e9e7f78 100644
--- a/ql/src/test/queries/clientpositive/serde_multi_delimit.q
+++ b/ql/src/test/queries/clientpositive/serde_multi_delimit.q
@@ -58,8 +58,18 @@ LOAD DATA LOCAL INPATH
"../../data/files/t3_multi_delimit.csv" INTO TABLE t3_mul
SELECT * FROM t3_multi_delimit;
+CREATE TABLE t4_multi_delimit(colA string,
+ colB string,
+ colC string)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH "../../data/files/t4_multi_delimit.csv" INTO TABLE
t4_multi_delimit;
+
+SELECT * FROM t4_multi_delimit;
+
DROP TABLE t1_multi_delimit;
DROP TABLE t11_csv_serde;
DROP TABLE t2_multi_delimit;
-DROP TABLE t3_multi_delimit;
\ No newline at end of file
+DROP TABLE t3_multi_delimit;
+DROP TABLE t4_multi_delimit;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/serde_multi_delimit.q.out
b/ql/src/test/results/clientpositive/llap/serde_multi_delimit.q.out
index 3437744..837f620 100644
--- a/ql/src/test/results/clientpositive/llap/serde_multi_delimit.q.out
+++ b/ql/src/test/results/clientpositive/llap/serde_multi_delimit.q.out
@@ -198,6 +198,43 @@ NULL NULL NULL NULL NULL
8 8 NULL 8 8
9 9 NULL 9 9
10101010 NULL NULL NULL NULL
+PREHOOK: query: CREATE TABLE t4_multi_delimit(colA string,
+ colB string,
+ colC string)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t4_multi_delimit
+POSTHOOK: query: CREATE TABLE t4_multi_delimit(colA string,
+ colB string,
+ colC string)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t4_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t4_multi_delimit.csv"
INTO TABLE t4_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t4_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH
"../../data/files/t4_multi_delimit.csv" INTO TABLE t4_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t4_multi_delimit
+PREHOOK: query: SELECT * FROM t4_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t4_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t4_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t4_multi_delimit
+#### A masked pattern was here ####
+Сок dsadsa NULL
+ááé üóüóüóüóüó
+
+áűáűáűáű
+űűű ááá óóó
PREHOOK: query: DROP TABLE t1_multi_delimit
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@t1_multi_delimit
@@ -230,3 +267,11 @@ POSTHOOK: query: DROP TABLE t3_multi_delimit
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@t3_multi_delimit
POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: DROP TABLE t4_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t4_multi_delimit
+PREHOOK: Output: default@t4_multi_delimit
+POSTHOOK: query: DROP TABLE t4_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t4_multi_delimit
+POSTHOOK: Output: default@t4_multi_delimit
diff --git
a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
index efe6597..289f7fe 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
@@ -27,7 +27,6 @@ import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.*;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyStruct;
@@ -70,7 +69,8 @@ public class MultiDelimitSerDe extends
AbstractEncodingAwareSerDe {
private static final String COLLECTION_DELIM = "collection.delim";
// actual delimiter(fieldDelimited) is replaced by REPLACEMENT_DELIM in row.
- private static final String REPLACEMENT_DELIM = "\1";
+ public static final String REPLACEMENT_DELIM_SEQUENCE = "\1";
+ public static final int REPLACEMENT_DELIM_LENGTH = REPLACEMENT_DELIM_SEQUENCE.getBytes().length;
private int numColumns;
private String fieldDelimited;
@@ -93,8 +93,6 @@ public class MultiDelimitSerDe extends
AbstractEncodingAwareSerDe {
private final ByteStream.Output serializeStream = new ByteStream.Output();
// The Writable to return in serialize
private final Text serializeCache = new Text();
- // pattern for delimiter
- private Pattern delimiterPattern;
@Override
public void initialize(Configuration conf, Properties tbl) throws
SerDeException {
@@ -106,7 +104,7 @@ public class MultiDelimitSerDe extends
AbstractEncodingAwareSerDe {
if (fieldDelimited == null || fieldDelimited.isEmpty()) {
throw new SerDeException("This table does not have serde property
\"field.delim\"!");
}
- delimiterPattern = Pattern.compile(fieldDelimited, Pattern.LITERAL);
+
// get the collection separator and map key separator
// TODO: use serdeConstants.COLLECTION_DELIM when the typo is fixed
collSep = LazyUtils.getByte(tbl.getProperty(COLLECTION_DELIM),
@@ -141,7 +139,7 @@ public class MultiDelimitSerDe extends
AbstractEncodingAwareSerDe {
}
- @Override
+ @Override
public Object doDeserialize(Writable blob) throws SerDeException {
if (byteArrayRef == null) {
byteArrayRef = new ByteArrayRef();
@@ -159,10 +157,10 @@ public class MultiDelimitSerDe extends
AbstractEncodingAwareSerDe {
} else {
throw new SerDeException(getClass() + ": expects either BytesWritable or
Text object!");
}
- byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM).getBytes());
+ byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM_SEQUENCE).getBytes());
cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
// use the multi-char delimiter to parse the lazy struct
- cachedLazyStruct.parseMultiDelimit(rowStr, delimiterPattern, REPLACEMENT_DELIM);
+ cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes());
return cachedLazyStruct;
}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index 9163824..66b1537 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -20,9 +20,10 @@ package org.apache.hadoop.hive.serde2.lazy;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import com.google.common.primitives.Bytes;
+
+import org.apache.hadoop.hive.serde2.MultiDelimitSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -279,14 +280,8 @@ public class LazyStruct extends
LazyNonPrimitive<LazySimpleStructObjectInspector
return serializedSize;
}
- /**
- * Parses rawRow using multi-char delimiter.
- *
- * @param rawRow row to be parsed, delimited by fieldDelimit
- * @param fieldDelimit pattern of multi-char delimiter
- * @param replacementDelim delimiter with which fieldDelimit has been
replaced in rawRow
- */
- public void parseMultiDelimit(final String rawRow, final Pattern fieldDelimit, final String replacementDelim) {
+ // parse the struct using multi-char delimiter
+ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
if (rawRow == null || fieldDelimit == null) {
return;
}
@@ -299,28 +294,46 @@ public class LazyStruct extends
LazyNonPrimitive<LazySimpleStructObjectInspector
fieldInited = new boolean[fields.length];
startPosition = new int[fields.length + 1];
}
- final int delimiterLength = fieldDelimit.toString().length();
- final int extraBytesInDelim = delimiterLength - replacementDelim.length();
-
+ // the indexes of the delimiters
+ int[] delimitIndexes = findIndexes(rawRow, fieldDelimit);
+ int diff = fieldDelimit.length - MultiDelimitSerDe.REPLACEMENT_DELIM_LENGTH;
// first field always starts from 0, even when missing
startPosition[0] = 0;
- Matcher delimiterMatcher = fieldDelimit.matcher(rawRow);
for (int i = 1; i <= fields.length; i++) {
- if (delimiterMatcher.find()) {
- // MultiDelimitSerDe replaces actual multi-char delimiter by
replacementDelim("\1") which reduces the length
- // however here we are getting rawRow with original multi-char
delimiter
- // due to this we have to subtract those extra chars to match length
of LazyNonPrimitive#bytes which are used
- // while reading data, see uncheckedGetField()
- startPosition[i] = delimiterMatcher.start() + delimiterLength - i *
extraBytesInDelim;
+ if (delimitIndexes[i - 1] != -1) {
+ int start = delimitIndexes[i - 1] + fieldDelimit.length;
+ startPosition[i] = start - i * diff;
} else {
startPosition[i] = length + 1;
}
}
-
Arrays.fill(fieldInited, false);
parsed = true;
}
+ // find all the indexes of the sub byte[]
+ private int[] findIndexes(byte[] array, byte[] target) {
+ if (fields.length <= 1) {
+ return new int[0];
+ }
+ int[] indexes = new int[fields.length];
+ Arrays.fill(indexes, -1);
+ indexes[0] = Bytes.indexOf(array, target);
+ if (indexes[0] == -1) {
+ return indexes;
+ }
+ int indexInNewArray = indexes[0];
+ for (int i = 1; i < indexes.length; i++) {
+ array = Arrays.copyOfRange(array, indexInNewArray + target.length, array.length);
+ indexInNewArray = Bytes.indexOf(array, target);
+ if (indexInNewArray == -1) {
+ break;
+ }
+ indexes[i] = indexInNewArray + indexes[i - 1] + target.length;
+ }
+ return indexes;
+ }
+
/**
* Return the data in bytes corresponding to this given struct. This is
useful specifically in
* cases where the data is stored in serialized formats like protobufs or
thrift and would need