[
https://issues.apache.org/jira/browse/HIVE-24151?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17194290#comment-17194290
]
Ádám Szita commented on HIVE-24151:
-----------------------------------
{code:java}
szita@szita-MBP16:~/shadow/CDH/hive$ git diff 463dae9ee8f694002af492e7d05924423aeaed09:serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java 5de36f990d89fcd5c3d7d2344a28e16e4c1f8c24:serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index f066aaa3bf5..66b15374dda 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -22,6 +22,8 @@
import java.util.List;

import com.google.common.primitives.Bytes;
+
+import org.apache.hadoop.hive.serde2.MultiDelimitSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -294,10 +296,10 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
}
// the indexes of the delimiters
int[] delimitIndexes = findIndexes(rawRow, fieldDelimit);
- int diff = fieldDelimit.length - 1;
+ int diff = fieldDelimit.length - MultiDelimitSerDe.REPLACEMENT_DELIM_LENGTH;
// first field always starts from 0, even when missing
startPosition[0] = 0;
- for (int i = 1; i < fields.length; i++) {
+ for (int i = 1; i <= fields.length; i++) {
if (delimitIndexes[i - 1] != -1) {
int start = delimitIndexes[i - 1] + fieldDelimit.length;
startPosition[i] = start - i * diff;
@@ -305,7 +307,6 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
startPosition[i] = length + 1;
}
}
- startPosition[fields.length] = length + 1;
Arrays.fill(fieldInited, false);
parsed = true;
}
@@ -315,7 +316,7 @@ public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
if (fields.length <= 1) {
return new int[0];
}
- int[] indexes = new int[fields.length - 1];
+ int[] indexes = new int[fields.length];
Arrays.fill(indexes, -1);
indexes[0] = Bytes.indexOf(array, target);
if (indexes[0] == -1) {
{code}
> MultiDelimitSerDe shifts data if strings contain non-ASCII characters
> ---------------------------------------------------------------------
>
> Key: HIVE-24151
> URL: https://issues.apache.org/jira/browse/HIVE-24151
> Project: Hive
> Issue Type: Bug
> Reporter: Ádám Szita
> Assignee: Ádám Szita
> Priority: Major
> Labels: pull-request-available
> Time Spent: 20m
> Remaining Estimate: 0h
>
> HIVE-22360 intended to fix another MultiDelimitSerDe problem (with NULL last
> columns) but introduced a regression: the approach of that fix is fundamentally
> flawed, as the existing logic that operated on bytes was replaced by regex
> matcher logic, which deals in character positions rather than byte positions.
> As some non-ASCII characters consist of more than one byte, the whole record
> may get shifted as a result.
> With this ticket I'm going to restore the old logic and apply the proper fix
> on top of it, while keeping (and extending) the test cases added with
> HIVE-22360, so that we have a solution for both issues.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)