This is an automated email from the ASF dual-hosted git repository.
abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 0d65a985b95 HIVE-26651: MultiDelimitSerDe shouldn't rely on default
charset when returning the deserialized string (#3690) (Laszlo Bodor reviewed
by Ayush Saxena)
0d65a985b95 is described below
commit 0d65a985b953e6a50a9f2d239ee0254ccf3e9402
Author: Bodor Laszlo <[email protected]>
AuthorDate: Thu Oct 20 08:20:56 2022 +0200
HIVE-26651: MultiDelimitSerDe shouldn't rely on default charset when
returning the deserialized string (#3690) (Laszlo Bodor reviewed by Ayush
Saxena)
---
.../clientpositive/chinese_utf8_characters.q | 19 +++
.../llap/chinese_utf8_characters.q.out | 132 +++++++++++++++++++++
.../hadoop/hive/serde2/MultiDelimitSerDe.java | 9 +-
3 files changed, 158 insertions(+), 2 deletions(-)
diff --git a/ql/src/test/queries/clientpositive/chinese_utf8_characters.q b/ql/src/test/queries/clientpositive/chinese_utf8_characters.q
index c4c3871c10c..aa9286b8211 100644
--- a/ql/src/test/queries/clientpositive/chinese_utf8_characters.q
+++ b/ql/src/test/queries/clientpositive/chinese_utf8_characters.q
@@ -1,6 +1,13 @@
CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string);
INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀');
+CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100))
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8')
+STORED AS TEXTFILE;
+INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2');
+
+
set hive.fetch.task.conversion=more;
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京';
SELECT * FROM default.tbl_chinese_chars where b='北京';
@@ -8,3 +15,15 @@ SELECT * FROM default.tbl_chinese_chars where b='北京';
set hive.fetch.task.conversion=none;
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京';
SELECT * FROM default.tbl_chinese_chars where b='北京';
+
+
+set hive.fetch.task.conversion=more;
+SELECT * FROM default.tbl_chinese_chars_multidelimitserde;
+EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
+SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
+
+
+set hive.fetch.task.conversion=none;
+SELECT * FROM default.tbl_chinese_chars_multidelimitserde;
+EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
+SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
diff --git a/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out b/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out
index fd6bfb982f2..911d1802cc4 100644
--- a/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out
+++ b/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out
@@ -17,6 +17,30 @@ POSTHOOK: Output: default@tbl_chinese_chars
POSTHOOK: Lineage: tbl_chinese_chars.a SCRIPT []
POSTHOOK: Lineage: tbl_chinese_chars.b SCRIPT []
POSTHOOK: Lineage: tbl_chinese_chars.c SCRIPT []
+PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde
(col1 varchar(100), col2 varchar(100))
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8')
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_chinese_chars_multidelimitserde
+POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde
(col1 varchar(100), col2 varchar(100))
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8')
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_chinese_chars_multidelimitserde
+PREHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde
values('测试1','测试2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_chinese_chars_multidelimitserde
+POSTHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde
values('测试1','测试2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_chinese_chars_multidelimitserde
+POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.col1 SCRIPT []
+POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.col2 SCRIPT []
PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'
PREHOOK: type: QUERY
PREHOOK: Input: default@tbl_chinese_chars
@@ -107,3 +131,111 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@tbl_chinese_chars
#### A masked pattern was here ####
2 北京 海淀
+PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+测试1 测试2
+PREHOOK: query: EXPLAIN SELECT * FROM
default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN SELECT * FROM
default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ TableScan
+ alias: tbl_chinese_chars_multidelimitserde
+ filterExpr: (col1 = '测试1') (type: boolean)
+ Filter Operator
+ predicate: (col1 = '测试1') (type: boolean)
+ Select Operator
+ expressions: '测试1' (type: varchar(100)), col2 (type:
varchar(100))
+ outputColumnNames: _col0, _col1
+ ListSink
+
+PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
where col1 = '测试1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
where col1 = '测试1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+测试1 测试2
+PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+测试1 测试2
+PREHOOK: query: EXPLAIN SELECT * FROM
default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN SELECT * FROM
default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: tbl_chinese_chars_multidelimitserde
+ filterExpr: (col1 = '测试1') (type: boolean)
+ Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: (col1 = '测试1') (type: boolean)
+ Statistics: Num rows: 1 Data size: 174 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: '测试1' (type: varchar(100)), col2 (type:
varchar(100))
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 174 Basic stats:
COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 174 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
where col1 = '测试1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde
where col1 = '测试1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde
+#### A masked pattern was here ####
+测试1 测试2
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
index 0ec33f4a14c..46a48fb2e88 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
@@ -20,6 +20,7 @@
package org.apache.hadoop.hive.serde2;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@@ -154,10 +155,14 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
} else {
throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!");
}
- byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM_SEQUENCE).getBytes());
+
+ // at this point, rowStr is supposed to be encoded with UTF8 (not with the serde's charset)
+ byteArrayRef.setData(
+ rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM_SEQUENCE).getBytes(StandardCharsets.UTF_8));
cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
// use the multi-char delimiter to parse the lazy struct
- cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes());
+ cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(StandardCharsets.UTF_8),
+ fieldDelimited.getBytes(StandardCharsets.UTF_8));
return cachedLazyStruct;
}