This is an automated email from the ASF dual-hosted git repository. dkuzmenko pushed a commit to annotated tag release-4.0.0-alpha-2-rc0 in repository https://gitbox.apache.org/repos/asf/hive.git
commit 5ffe0a8a2caaa6c8408308df2df275060b416a40 Author: Bodor Laszlo <[email protected]> AuthorDate: Thu Oct 20 08:20:56 2022 +0200 HIVE-26651: MultiDelimitSerDe shouldn't rely on default charset when returning the deserialized string (#3690) (Laszlo Bodor reviewed by Ayush Saxena) (cherry picked from commit 0d65a985b953e6a50a9f2d239ee0254ccf3e9402) --- .../clientpositive/chinese_utf8_characters.q | 19 +++ .../llap/chinese_utf8_characters.q.out | 132 +++++++++++++++++++++ .../hadoop/hive/serde2/MultiDelimitSerDe.java | 9 +- 3 files changed, 158 insertions(+), 2 deletions(-) diff --git a/ql/src/test/queries/clientpositive/chinese_utf8_characters.q b/ql/src/test/queries/clientpositive/chinese_utf8_characters.q index c4c3871c10c..aa9286b8211 100644 --- a/ql/src/test/queries/clientpositive/chinese_utf8_characters.q +++ b/ql/src/test/queries/clientpositive/chinese_utf8_characters.q @@ -1,6 +1,13 @@ CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string); INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀'); +CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') +STORED AS TEXTFILE; +INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2'); + + set hive.fetch.task.conversion=more; EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'; SELECT * FROM default.tbl_chinese_chars where b='北京'; @@ -8,3 +15,15 @@ SELECT * FROM default.tbl_chinese_chars where b='北京'; set hive.fetch.task.conversion=none; EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'; SELECT * FROM default.tbl_chinese_chars where b='北京'; + + +set hive.fetch.task.conversion=more; +SELECT * FROM default.tbl_chinese_chars_multidelimitserde; +EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; +SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; + + +set hive.fetch.task.conversion=none; +SELECT * FROM default.tbl_chinese_chars_multidelimitserde; +EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; +SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; diff --git a/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out b/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out index fd6bfb982f2..911d1802cc4 100644 --- a/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out +++ b/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out @@ -17,6 +17,30 @@ POSTHOOK: Output: default@tbl_chinese_chars POSTHOOK: Lineage: tbl_chinese_chars.a SCRIPT [] POSTHOOK: Lineage: tbl_chinese_chars.b SCRIPT [] POSTHOOK: Lineage: tbl_chinese_chars.c SCRIPT [] +PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tbl_chinese_chars_multidelimitserde +POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tbl_chinese_chars_multidelimitserde +PREHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tbl_chinese_chars_multidelimitserde +POSTHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tbl_chinese_chars_multidelimitserde +POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.col1 SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.col2 SCRIPT [] PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars @@ -107,3 +131,111 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### 2 北京 海淀 +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +测试1 测试2 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: tbl_chinese_chars_multidelimitserde + filterExpr: (col1 = '测试1') (type: boolean) + Filter Operator + predicate: (col1 = '测试1') (type: boolean) + Select Operator + expressions: '测试1' (type: varchar(100)), col2 (type: varchar(100)) + outputColumnNames: _col0, _col1 + ListSink + +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +测试1 测试2 +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +测试1 测试2 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tbl_chinese_chars_multidelimitserde + filterExpr: (col1 = '测试1') (type: boolean) + Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (col1 = '测试1') (type: boolean) + Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: '测试1' (type: varchar(100)), col2 (type: varchar(100)) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde +#### A masked pattern was here #### +测试1 测试2 diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java index 0ec33f4a14c..46a48fb2e88 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hive.serde2; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.Properties; @@ -154,10 +155,14 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe { } else { throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!"); } - byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM_SEQUENCE).getBytes()); + + // at this point, rowStr is supposed to be encoded with UTF8 (not with the serde's charset) + byteArrayRef.setData( + rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM_SEQUENCE).getBytes(StandardCharsets.UTF_8)); cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length); // use the multi-char delimiter to parse the lazy struct - cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes()); + cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(StandardCharsets.UTF_8), + fieldDelimited.getBytes(StandardCharsets.UTF_8)); return cachedLazyStruct; }
