This is an automated email from the ASF dual-hosted git repository.
zhangbutao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 36d32ec7fd6 HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with
LIKE '%xxx%' (#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang)
36d32ec7fd6 is described below
commit 36d32ec7fd6ac9053e6a9d28f01dd431149a5ac4
Author: Ryu Kobayashi <[email protected]>
AuthorDate: Tue Jan 23 13:05:48 2024 +0900
HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx%'
(#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang)
---
data/files/control_characters.txt | 1 +
.../clientpositive/like_control_characters.q | 13 +++
.../llap/like_control_characters.q.out | 93 ++++++++++++++++++++++
.../ql/exec/vector/expressions/StringExpr.java | 10 ++-
.../ql/exec/vector/expressions/TestStringExpr.java | 24 +++++-
5 files changed, 138 insertions(+), 3 deletions(-)
diff --git a/data/files/control_characters.txt b/data/files/control_characters.txt
new file mode 100644
index 00000000000..4e3fc6c4535
--- /dev/null
+++ b/data/files/control_characters.txt
@@ -0,0 +1 @@
+abcde�fghi
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/like_control_characters.q b/ql/src/test/queries/clientpositive/like_control_characters.q
new file mode 100644
index 00000000000..5f9772ed2ef
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/like_control_characters.q
@@ -0,0 +1,13 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.vectorized.execution.enabled=true;
+
+create temporary table foo (col string);
+
+-- SORT_QUERY_RESULTS
+
+LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo;
+
+explain select col, count(*) from foo where col like '%fg%' group by col;
+select col, count(*) from foo where col like '%fg%' group by col;
+
diff --git a/ql/src/test/results/clientpositive/llap/like_control_characters.q.out b/ql/src/test/results/clientpositive/llap/like_control_characters.q.out
new file mode 100644
index 00000000000..14aa86328db
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/like_control_characters.q.out
@@ -0,0 +1,93 @@
+PREHOOK: query: create temporary table foo (col string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@foo
+POSTHOOK: query: create temporary table foo (col string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@foo
+PREHOOK: query: LOAD DATA LOCAL INPATH
'../../data/files/control_characters.txt' INTO TABLE foo
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@foo
+POSTHOOK: query: LOAD DATA LOCAL INPATH
'../../data/files/control_characters.txt' INTO TABLE foo
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@foo
+PREHOOK: query: explain select col, count(*) from foo where col like '%fg%'
group by col
+PREHOOK: type: QUERY
+PREHOOK: Input: default@foo
+#### A masked pattern was here ####
+POSTHOOK: query: explain select col, count(*) from foo where col like '%fg%'
group by col
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@foo
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: foo
+ filterExpr: (col like '%fg%') (type: boolean)
+ Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE
Column stats: NONE
+ Filter Operator
+ predicate: (col like '%fg%') (type: boolean)
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count()
+ keys: col (type: string)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE
Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE
Column stats: NONE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col, count(*) from foo where col like '%fg%' group by
col
+PREHOOK: type: QUERY
+PREHOOK: Input: default@foo
+#### A masked pattern was here ####
+POSTHOOK: query: select col, count(*) from foo where col like '%fg%' group by
col
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@foo
+#### A masked pattern was here ####
+abcde�fghi 1
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
index b6d3184ffed..34097167ac1 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
@@ -342,7 +342,15 @@ public class StringExpr {
}
s_tmp--;
}
- next += shift[input[next] & MAX_BYTE];
+
+ // A byte that masks to MAX_BYTE would index past the end of shift
+ // (presumably sized MAX_BYTE - confirm), so advance by one position instead.
+ int shiftIndex = input[next] & MAX_BYTE;
+ if (shiftIndex >= MAX_BYTE) {
+ next++;
+ } else {
+ next += shift[shiftIndex];
+ }
}
return -1;
}
diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
index 6fb66115277..483eb68b979 100644
--- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
+++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
@@ -20,9 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
import org.junit.Test;
-import java.nio.charset.StandardCharsets;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.*;
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.StandardCharsets;
public class TestStringExpr {
@Test
@@ -49,6 +51,24 @@ public class TestStringExpr {
assertEquals("Testing match at end of string", 24, find(pattern, input4));
}
+ @Test
+ public void testControlCharacters() throws Exception {
+ StringExpr.Finder pattern = compile("pattern");
+ assertNotNull(pattern);
+
+ byte b = -1;
+ byte[] controlBytes1 = "abcedf".getBytes(StandardCharsets.UTF_8);
+ byte[] controlBytes2 = "pattern".getBytes(StandardCharsets.UTF_8);
+ ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ outputStream.write(controlBytes1);
+ outputStream.write(b);
+ outputStream.write(controlBytes2);
+ byte[] controlChar = outputStream.toByteArray();
+ outputStream.close();
+
+ assertEquals("Testing valid match", 7, pattern.find(controlChar, 0, controlChar.length));
+ }
+
private StringExpr.Finder compile(String pattern) {
return StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
}