This is an automated email from the ASF dual-hosted git repository.
ayushtkn pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 1ea685ef3ba HIVE-28728: fix STR_TO_MAP() returning garbled utf-8
characters when vectorization is enabled (#6559)
1ea685ef3ba is described below
commit 1ea685ef3babdaf2080079146f9031b9a7905e95
Author: cyanzheng2926 <[email protected]>
AuthorDate: Thu Jun 25 19:29:16 2026 +0800
HIVE-28728: fix STR_TO_MAP() returning garbled utf-8 characters when
vectorization is enabled (#6559)
---
.../test/resources/testconfiguration.properties | 1 +
.../hive/ql/exec/vector/VectorAssignRow.java | 3 +-
.../test/queries/clientpositive/str_to_map_utf8.q | 55 +++++
.../clientpositive/llap/str_to_map_utf8.q.out | 270 +++++++++++++++++++++
4 files changed, 328 insertions(+), 1 deletion(-)
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index b4441518a64..bfefc3b5f02 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -180,6 +180,7 @@ minillap.query.files=\
skip_header_footer_aggr.q,\
skip_header_footer_proj.q,\
str_to_map.q,\
+ str_to_map_utf8.q,\
table_nonprintable.q,\
temp_table_add_part_with_loc.q,\
temp_table_add_partition_with_location.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java
index b7f3377a518..5e3768660f6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorAssignRow.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hive.ql.exec.vector;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -465,7 +466,7 @@ private void assignRowColumn(
{
if (object instanceof String) {
String string = (String) object;
- byte[] bytes = string.getBytes();
+ byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
((BytesColumnVector) columnVector).setVal(
batchIndex, bytes, 0, bytes.length);
} else {
diff --git a/ql/src/test/queries/clientpositive/str_to_map_utf8.q b/ql/src/test/queries/clientpositive/str_to_map_utf8.q
new file mode 100644
index 00000000000..c8f54cf2571
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/str_to_map_utf8.q
@@ -0,0 +1,55 @@
+-- HIVE-28728: STR_TO_MAP() must preserve UTF-8 in vectorized execution when JVM default charset is not UTF-8.
+-- Tez container opts below force US-ASCII in Tez tasks
+-- Use driver-level mimic for testing with llap: -Dmaven.test.jvm.args="-Dfile.encoding=US-ASCII"
+
+SET tez.am.launch.cmd-opts=-Dfile.encoding=US-ASCII;
+SET hive.tez.java.opts=-Dfile.encoding=US-ASCII;
+SET hive.vectorized.execution.enabled=true;
+SET hive.fetch.task.conversion=none;
+
+DROP TABLE IF EXISTS hive28728_src;
+DROP TABLE IF EXISTS hive28728_result;
+DROP TABLE IF EXISTS hive28728_multi;
+DROP TABLE IF EXISTS hive28728_result_novec;
+
+CREATE TABLE hive28728_src (id string, name string, multi string) STORED AS ORC;
+INSERT INTO hive28728_src VALUES
+ ('100','hive', 'en:1'),
+ ('200','spark', null),
+ ('300','oozie', 'a:1,b:2'),
+ ('400','airflow', 'ascii:值'),
+ ('500','优惠活动', '上海:北京,优惠活动:折扣'),
+ ('600','日本語', 'val:1,val:2');
+
+SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id;
+SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id;
+SELECT STR_TO_MAP(multi, ',', ':')['优惠活动'] FROM hive28728_src WHERE id = '500';
+SELECT STR_TO_MAP('优惠活动:折扣,北京:海淀', ',', ':');
+
+SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE id = '200';
+SELECT STR_TO_MAP('700', ',', ':');
+
+-- Vectorized INSERT OVERWRITE
+CREATE TABLE hive28728_result (cd MAP<STRING, STRING>) STORED AS ORC;
+INSERT OVERWRITE TABLE hive28728_result
+ SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src;
+SELECT * FROM hive28728_result ORDER BY cd;
+
+CREATE TABLE hive28728_multi (cd MAP<STRING, STRING>) STORED AS ORC;
+INSERT OVERWRITE TABLE hive28728_multi
+ SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id;
+SELECT * FROM hive28728_multi ORDER BY cd;
+
+-- Non-vectorized baseline
+SET hive.vectorized.execution.enabled=false;
+CREATE TABLE hive28728_result_novec (cd MAP<STRING, STRING>) STORED AS ORC;
+INSERT OVERWRITE TABLE hive28728_result_novec
+ SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src;
+SELECT * FROM hive28728_result_novec ORDER BY cd;
+
+SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id;
+
+DROP TABLE IF EXISTS hive28728_src;
+DROP TABLE IF EXISTS hive28728_result;
+DROP TABLE IF EXISTS hive28728_multi;
+DROP TABLE IF EXISTS hive28728_result_novec;
diff --git a/ql/src/test/results/clientpositive/llap/str_to_map_utf8.q.out b/ql/src/test/results/clientpositive/llap/str_to_map_utf8.q.out
new file mode 100644
index 00000000000..403c94bd044
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/str_to_map_utf8.q.out
@@ -0,0 +1,270 @@
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_src
+PREHOOK: type: DROPTABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_src
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: database:default
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_result
+PREHOOK: type: DROPTABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_result
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: database:default
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_multi
+PREHOOK: type: DROPTABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_multi
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: database:default
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_result_novec
+PREHOOK: type: DROPTABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_result_novec
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: database:default
+PREHOOK: query: CREATE TABLE hive28728_src (id string, name string, multi string) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_src
+POSTHOOK: query: CREATE TABLE hive28728_src (id string, name string, multi string) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_src
+PREHOOK: query: INSERT INTO hive28728_src VALUES
+ ('100','hive', 'en:1'),
+ ('200','spark', null),
+ ('300','oozie', 'a:1,b:2'),
+ ('400','airflow', 'ascii:值'),
+ ('500','优惠活动', '上海:北京,优惠活动:折扣'),
+ ('600','日本語', 'val:1,val:2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@hive28728_src
+POSTHOOK: query: INSERT INTO hive28728_src VALUES
+ ('100','hive', 'en:1'),
+ ('200','spark', null),
+ ('300','oozie', 'a:1,b:2'),
+ ('400','airflow', 'ascii:值'),
+ ('500','优惠活动', '上海:北京,优惠活动:折扣'),
+ ('600','日本語', 'val:1,val:2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@hive28728_src
+POSTHOOK: Lineage: hive28728_src.id SCRIPT []
+POSTHOOK: Lineage: hive28728_src.multi SCRIPT []
+POSTHOOK: Lineage: hive28728_src.name SCRIPT []
+PREHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"100":"hive"}
+{"200":"spark"}
+{"300":"oozie"}
+{"400":"airflow"}
+{"500":"优惠活动"}
+{"600":"日本語"}
+PREHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"en":"1"}
+{"a":"1","b":"2"}
+{"ascii":"值"}
+{"上海":"北京","优惠活动":"折扣"}
+{"val":"2"}
+PREHOOK: query: SELECT STR_TO_MAP(multi, ',', ':')['优惠活动'] FROM hive28728_src WHERE id = '500'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP(multi, ',', ':')['优惠活动'] FROM hive28728_src WHERE id = '500'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+折扣
+PREHOOK: query: SELECT STR_TO_MAP('优惠活动:折扣,北京:海淀', ',', ':')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP('优惠活动:折扣,北京:海淀', ',', ':')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"优惠活动":"折扣","北京":"海淀"}
+PREHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE id = '200'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE id = '200'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{}
+PREHOOK: query: SELECT STR_TO_MAP('700', ',', ':')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP('700', ',', ':')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"700":null}
+PREHOOK: query: CREATE TABLE hive28728_result (cd MAP<STRING, STRING>) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_result
+POSTHOOK: query: CREATE TABLE hive28728_result (cd MAP<STRING, STRING>) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_result
+PREHOOK: query: INSERT OVERWRITE TABLE hive28728_result
+ SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: default@hive28728_result
+POSTHOOK: query: INSERT OVERWRITE TABLE hive28728_result
+ SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: default@hive28728_result
+POSTHOOK: Lineage: hive28728_result.cd EXPRESSION [(hive28728_src)hive28728_src.FieldSchema(name:id, type:string, comment:null), (hive28728_src)hive28728_src.FieldSchema(name:name, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM hive28728_result ORDER BY cd
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_result
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT * FROM hive28728_result ORDER BY cd
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_result
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"100":"hive"}
+{"200":"spark"}
+{"300":"oozie"}
+{"400":"airflow"}
+{"500":"优惠活动"}
+{"600":"日本語"}
+PREHOOK: query: CREATE TABLE hive28728_multi (cd MAP<STRING, STRING>) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_multi
+POSTHOOK: query: CREATE TABLE hive28728_multi (cd MAP<STRING, STRING>) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_multi
+PREHOOK: query: INSERT OVERWRITE TABLE hive28728_multi
+ SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: default@hive28728_multi
+POSTHOOK: query: INSERT OVERWRITE TABLE hive28728_multi
+ SELECT STR_TO_MAP(multi, ',', ':') FROM hive28728_src WHERE multi IS NOT NULL ORDER BY id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: default@hive28728_multi
+POSTHOOK: Lineage: hive28728_multi.cd EXPRESSION [(hive28728_src)hive28728_src.FieldSchema(name:multi, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM hive28728_multi ORDER BY cd
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_multi
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT * FROM hive28728_multi ORDER BY cd
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_multi
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"a":"1","b":"2"}
+{"ascii":"值"}
+{"en":"1"}
+{"val":"2"}
+{"上海":"北京","优惠活动":"折扣"}
+PREHOOK: query: CREATE TABLE hive28728_result_novec (cd MAP<STRING, STRING>) STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_result_novec
+POSTHOOK: query: CREATE TABLE hive28728_result_novec (cd MAP<STRING, STRING>) STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_result_novec
+PREHOOK: query: INSERT OVERWRITE TABLE hive28728_result_novec
+ SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: default@hive28728_result_novec
+POSTHOOK: query: INSERT OVERWRITE TABLE hive28728_result_novec
+ SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: default@hive28728_result_novec
+POSTHOOK: Lineage: hive28728_result_novec.cd EXPRESSION [(hive28728_src)hive28728_src.FieldSchema(name:id, type:string, comment:null), (hive28728_src)hive28728_src.FieldSchema(name:name, type:string, comment:null), ]
+PREHOOK: query: SELECT * FROM hive28728_result_novec ORDER BY cd
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_result_novec
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT * FROM hive28728_result_novec ORDER BY cd
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_result_novec
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"100":"hive"}
+{"200":"spark"}
+{"300":"oozie"}
+{"400":"airflow"}
+{"500":"优惠活动"}
+{"600":"日本語"}
+PREHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT STR_TO_MAP(CONCAT(id, ':', name), ',', ':') FROM hive28728_src ORDER BY id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+{"100":"hive"}
+{"200":"spark"}
+{"300":"oozie"}
+{"400":"airflow"}
+{"500":"优惠活动"}
+{"600":"日本語"}
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_src
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hive28728_src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_src
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_src
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hive28728_src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_src
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_result
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hive28728_result
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_result
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_result
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hive28728_result
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_result
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_multi
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hive28728_multi
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_multi
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_multi
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hive28728_multi
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_multi
+PREHOOK: query: DROP TABLE IF EXISTS hive28728_result_novec
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hive28728_result_novec
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hive28728_result_novec
+POSTHOOK: query: DROP TABLE IF EXISTS hive28728_result_novec
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hive28728_result_novec
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hive28728_result_novec