This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 6722bcb5283 HIVE-15826: Add 'serialization.encoding' To All SerDes
(Shailesh Gupta, reviewed by Laszlo Bodor, Krisztian Kasa )
6722bcb5283 is described below
commit 6722bcb5283b04080cfbf06ca45c75772a8f2e4c
Author: Shailesh Gupta <[email protected]>
AuthorDate: Thu Mar 2 13:12:42 2023 +0530
HIVE-15826: Add 'serialization.encoding' To All SerDes (Shailesh Gupta,
reviewed by Laszlo Bodor, Krisztian Kasa )
Co-authored-by: Shailesh Gupta <[email protected]>
---
.../hadoop/hive/contrib/serde2/RegexSerDe.java | 23 ++++--
.../src/test/queries/clientpositive/serde_regex.q | 30 ++++++-
.../test/results/clientpositive/serde_regex.q.out | 93 ++++++++++++++++++++++
data/files/opencsv-data.txt | 3 +-
.../org/apache/hive/hcatalog/data/JsonSerDe.java | 30 +++++--
ql/src/test/queries/clientpositive/json_serde1.q | 11 +++
ql/src/test/queries/clientpositive/serde_opencsv.q | 8 +-
ql/src/test/queries/clientpositive/serde_regex.q | 29 +++++++
.../results/clientpositive/llap/json_serde1.q.out | 46 +++++++++++
.../clientpositive/llap/serde_opencsv.q.out | 27 +++++--
.../results/clientpositive/llap/serde_regex.q.out | 93 ++++++++++++++++++++++
.../apache/hadoop/hive/serde2/OpenCSVSerde.java | 18 ++++-
.../org/apache/hadoop/hive/serde2/RegexSerDe.java | 20 ++++-
13 files changed, 404 insertions(+), 27 deletions(-)
diff --git
a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
b/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
index 9f1a9b6919b..ff8fb8f339d 100644
--- a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
+++ b/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
@@ -25,11 +25,12 @@ import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -70,10 +71,10 @@ import org.apache.hadoop.io.Writable;
* based Regex library.
*/
@SerDeSpec(schemaProps = {
- serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
+ serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
serdeConstants.SERIALIZATION_ENCODING,
RegexSerDe.INPUT_REGEX, RegexSerDe.OUTPUT_FORMAT_STRING,
RegexSerDe.INPUT_REGEX_CASE_SENSITIVE })
-public class RegexSerDe extends AbstractSerDe {
+public class RegexSerDe extends AbstractEncodingAwareSerDe {
public static final String INPUT_REGEX = "input.regex";
public static final String OUTPUT_FORMAT_STRING = "output.format.string";
@@ -156,7 +157,7 @@ public class RegexSerDe extends AbstractSerDe {
}
@Override
- public Object deserialize(Writable blob) throws SerDeException {
+ public Object doDeserialize(Writable blob) throws SerDeException {
if (inputPattern == null) {
throw new SerDeException(
@@ -200,7 +201,7 @@ public class RegexSerDe extends AbstractSerDe {
Text outputRowText;
@Override
- public Writable serialize(Object obj, ObjectInspector objInspector)
+ public Writable doSerialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
if (outputFormatString == null) {
@@ -252,4 +253,16 @@ public class RegexSerDe extends AbstractSerDe {
return outputRowText;
}
+ @Override
+ protected Writable transformFromUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextFromUTF8(text, this.charset);
+ }
+
+ @Override
+ protected Writable transformToUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextToUTF8(text, this.charset);
+ }
+
}
diff --git a/contrib/src/test/queries/clientpositive/serde_regex.q
b/contrib/src/test/queries/clientpositive/serde_regex.q
index 8aa3eda201d..8487fa01a0e 100644
--- a/contrib/src/test/queries/clientpositive/serde_regex.q
+++ b/contrib/src/test/queries/clientpositive/serde_regex.q
@@ -39,4 +39,32 @@ STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH "../../data/files/apache.access.log" INTO TABLE
serde_regex;
LOAD DATA LOCAL INPATH "../../data/files/apache.access.2.log" INTO TABLE
serde_regex;
-SELECT * FROM serde_regex ORDER BY `time`;
\ No newline at end of file
+SELECT * FROM serde_regex ORDER BY `time`;
+
+
+EXPLAIN
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE
serde_regex2;
+
+SELECT key, value FROM serde_regex2 ORDER BY key, value;
+
+DROP TABLE serde_regex2;
\ No newline at end of file
diff --git a/contrib/src/test/results/clientpositive/serde_regex.q.out
b/contrib/src/test/results/clientpositive/serde_regex.q.out
index 7462568938b..ff1b771b813 100644
--- a/contrib/src/test/results/clientpositive/serde_regex.q.out
+++ b/contrib/src/test/results/clientpositive/serde_regex.q.out
@@ -117,3 +117,96 @@ POSTHOOK: Input: default@serde_regex
#### A masked pattern was here ####
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET
/apache_pb.gif HTTP/1.0" 200 2326 NULL NULL
127.0.0.1 - - [26/May/2009:00:00:00 +0000] "GET
/someurl/?track=Blabla(Main) HTTP/1.1" 200 5864 - "Mozilla/5.0
(Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko)
Chrome/1.0.154.65 Safari/525.19"
+PREHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Create Table
+ columns: key string, value string
+ name: default.serde_regex2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+ serde name: org.apache.hadoop.hive.serde2.RegexSerDe
+ serde properties:
+ input.regex ([^ ]*),([^ ]*)
+ serialization.encoding ISO8859_1
+
+PREHOOK: query: CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: LOAD DATA LOCAL INPATH
"../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: LOAD DATA LOCAL INPATH
"../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+Jørgensen Jørgen
+Müller Thomas
+Nåm Fæk
+Peña Andrés
+PREHOOK: query: DROP TABLE serde_regex2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@serde_regex2
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: DROP TABLE serde_regex2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@serde_regex2
+POSTHOOK: Output: default@serde_regex2
diff --git a/data/files/opencsv-data.txt b/data/files/opencsv-data.txt
index 7d5968b468f..3314bf57151 100644
--- a/data/files/opencsv-data.txt
+++ b/data/files/opencsv-data.txt
@@ -1,3 +1,4 @@
why hello there,42,3,100,1412341,true,42.43,85.23423424
another record,98,4,101,9999999,false,99.89,0.00000009
-third record,45,5,102,999999999,true,89.99,0.00000000000009
\ No newline at end of file
+third record,45,5,102,999999999,true,89.99,0.00000000000009
+Müller Thomas,42,3,100,1412341,true,42.43,85.23423424
\ No newline at end of file
diff --git
a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
index 0525b7e51cb..7defd21c081 100644
--- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
+++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
@@ -31,9 +31,10 @@ import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.io.Text;
@@ -44,8 +45,9 @@ import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS,
serdeConstants.LIST_COLUMN_TYPES,
- serdeConstants.TIMESTAMP_FORMATS})
-public class JsonSerDe extends AbstractSerDe {
+ serdeConstants.TIMESTAMP_FORMATS,
+ serdeConstants.SERIALIZATION_ENCODING})
+public class JsonSerDe extends AbstractEncodingAwareSerDe {
private HCatSchema schema;
@@ -78,7 +80,7 @@ public class JsonSerDe extends AbstractSerDe {
* our own object implementation, and we use HCatRecord for it
*/
@Override
- public Object deserialize(Writable blob) throws SerDeException {
+ public Object doDeserialize(Writable blob) throws SerDeException {
try {
List<?> row = (List<?>) jsonSerde.deserialize(blob);
List<Object> fatRow = fatLand(row);
@@ -165,7 +167,7 @@ public class JsonSerDe extends AbstractSerDe {
* and generate a Text representation of the object.
*/
@Override
- public Writable serialize(Object obj, ObjectInspector objInspector)
+ public Writable doSerialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
return jsonSerde.serialize(obj, objInspector);
}
@@ -184,4 +186,22 @@ public class JsonSerDe extends AbstractSerDe {
return Text.class;
}
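+ /**
+ * Transforms Writable data from UTF-8 to this SerDe's charset before serialization.
+ * @param blob the UTF-8 encoded data, which must be a {@link Text}
+ * @return the data re-encoded in this SerDe's charset
+ */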
+
+ @Override
+ protected Writable transformFromUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextFromUTF8(text, this.charset);
+ }
+
+ @Override
+ protected Writable transformToUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextToUTF8(text, this.charset);
+ }
+
}
diff --git a/ql/src/test/queries/clientpositive/json_serde1.q
b/ql/src/test/queries/clientpositive/json_serde1.q
index fcbf1c07529..3cb0c0fe481 100644
--- a/ql/src/test/queries/clientpositive/json_serde1.q
+++ b/ql/src/test/queries/clientpositive/json_serde1.q
@@ -3,6 +3,7 @@
drop table if exists json_serde1_1;
drop table if exists json_serde1_2;
drop table if exists json_serde1_3;
+drop table if exists json_serde1_4;
create table json_serde1_1 (a array<string>,b map<string,int>)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe';
@@ -35,6 +36,16 @@ select * from json_serde1_2;
create table json_serde1_3 (c1 int, c2 string) stored as jsonfile;
show create table json_serde1_3;
+create table json_serde1_4 (a array<string>,b map<string,int>)
+ row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
+ WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1');
+
+insert into table json_serde1_4
+ select array('Müller'),map('Müller',1) from src limit 2;
+
+select * from json_serde1_4;
+
drop table json_serde1_1;
drop table json_serde1_2;
drop table json_serde1_3;
+drop table json_serde1_4;
diff --git a/ql/src/test/queries/clientpositive/serde_opencsv.q
b/ql/src/test/queries/clientpositive/serde_opencsv.q
index 26d79a66305..6e620c099fe 100644
--- a/ql/src/test/queries/clientpositive/serde_opencsv.q
+++ b/ql/src/test/queries/clientpositive/serde_opencsv.q
@@ -12,7 +12,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"separatorChar" = ",",
"quoteChar" = "\'",
- "escapeChar" = "\\"
+ "escapeChar" = "\\",
+ "serialization.encoding" = "ISO8859_1"
) stored as textfile;
CREATE TABLE serde_opencsv(
@@ -28,9 +29,12 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"separatorChar" = ",",
"quoteChar" = "\'",
- "escapeChar" = "\\"
+ "escapeChar" = "\\",
+ "serialization.encoding" = "ISO8859_1"
) stored as textfile;
LOAD DATA LOCAL INPATH "../../data/files/opencsv-data.txt" INTO TABLE
serde_opencsv;
SELECT count(*) FROM serde_opencsv;
+
+SELECT * FROM serde_opencsv;
diff --git a/ql/src/test/queries/clientpositive/serde_regex.q
b/ql/src/test/queries/clientpositive/serde_regex.q
index fc716ed8327..a193ef76555 100644
--- a/ql/src/test/queries/clientpositive/serde_regex.q
+++ b/ql/src/test/queries/clientpositive/serde_regex.q
@@ -41,6 +41,7 @@ SELECT host, size, status, `time` from serde_regex ORDER BY
`time`;
DROP TABLE serde_regex;
+
EXPLAIN
CREATE TABLE serde_regex1(
key decimal(38,18),
@@ -65,3 +66,31 @@ LOAD DATA LOCAL INPATH "../../data/files/kv7.txt" INTO TABLE
serde_regex1;
SELECT key, value FROM serde_regex1 ORDER BY key, value;
DROP TABLE serde_regex1;
+
+
+EXPLAIN
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE
serde_regex2;
+
+SELECT key, value FROM serde_regex2 ORDER BY key, value;
+
+DROP TABLE serde_regex2;
diff --git a/ql/src/test/results/clientpositive/llap/json_serde1.q.out
b/ql/src/test/results/clientpositive/llap/json_serde1.q.out
index 341a494e0c5..0ee5f505de7 100644
--- a/ql/src/test/results/clientpositive/llap/json_serde1.q.out
+++ b/ql/src/test/results/clientpositive/llap/json_serde1.q.out
@@ -10,6 +10,10 @@ PREHOOK: query: drop table if exists json_serde1_3
PREHOOK: type: DROPTABLE
POSTHOOK: query: drop table if exists json_serde1_3
POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists json_serde1_4
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists json_serde1_4
+POSTHOOK: type: DROPTABLE
PREHOOK: query: create table json_serde1_1 (a array<string>,b map<string,int>)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
PREHOOK: type: CREATETABLE
@@ -127,6 +131,40 @@ LOCATION
TBLPROPERTIES (
'bucketing_version'='2',
#### A masked pattern was here ####
+PREHOOK: query: create table json_serde1_4 (a array<string>,b map<string,int>)
+ row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
+ WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@json_serde1_4
+POSTHOOK: query: create table json_serde1_4 (a array<string>,b map<string,int>)
+ row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
+ WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@json_serde1_4
+PREHOOK: query: insert into table json_serde1_4
+ select array('Müller'),map('Müller',1) from src limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@json_serde1_4
+POSTHOOK: query: insert into table json_serde1_4
+ select array('Müller'),map('Müller',1) from src limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@json_serde1_4
+POSTHOOK: Lineage: json_serde1_4.a EXPRESSION []
+POSTHOOK: Lineage: json_serde1_4.b EXPRESSION []
+PREHOOK: query: select * from json_serde1_4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@json_serde1_4
+#### A masked pattern was here ####
+POSTHOOK: query: select * from json_serde1_4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@json_serde1_4
+#### A masked pattern was here ####
+["Müller"] {"Müller":1}
+["Müller"] {"Müller":1}
PREHOOK: query: drop table json_serde1_1
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@json_serde1_1
@@ -151,3 +189,11 @@ POSTHOOK: query: drop table json_serde1_3
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@json_serde1_3
POSTHOOK: Output: default@json_serde1_3
+PREHOOK: query: drop table json_serde1_4
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@json_serde1_4
+PREHOOK: Output: default@json_serde1_4
+POSTHOOK: query: drop table json_serde1_4
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@json_serde1_4
+POSTHOOK: Output: default@json_serde1_4
diff --git a/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
b/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
index f39ee7322c5..730a385f59c 100644
--- a/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
+++ b/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
@@ -12,7 +12,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"separatorChar" = ",",
"quoteChar" = "\'",
- "escapeChar" = "\\"
+ "escapeChar" = "\\",
+ "serialization.encoding" = "ISO8859_1"
) stored as textfile
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
@@ -31,7 +32,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"separatorChar" = ",",
"quoteChar" = "\'",
- "escapeChar" = "\\"
+ "escapeChar" = "\\",
+ "serialization.encoding" = "ISO8859_1"
) stored as textfile
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
@@ -51,6 +53,7 @@ STAGE PLANS:
escapeChar \
quoteChar '
separatorChar ,
+ serialization.encoding ISO8859_1
PREHOOK: query: CREATE TABLE serde_opencsv(
words STRING,
@@ -65,7 +68,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"separatorChar" = ",",
"quoteChar" = "\'",
- "escapeChar" = "\\"
+ "escapeChar" = "\\",
+ "serialization.encoding" = "ISO8859_1"
) stored as textfile
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
@@ -83,7 +87,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"separatorChar" = ",",
"quoteChar" = "\'",
- "escapeChar" = "\\"
+ "escapeChar" = "\\",
+ "serialization.encoding" = "ISO8859_1"
) stored as textfile
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
@@ -104,4 +109,16 @@ POSTHOOK: query: SELECT count(*) FROM serde_opencsv
POSTHOOK: type: QUERY
POSTHOOK: Input: default@serde_opencsv
#### A masked pattern was here ####
-3
+4
+PREHOOK: query: SELECT * FROM serde_opencsv
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_opencsv
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM serde_opencsv
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_opencsv
+#### A masked pattern was here ####
+why hello there 42 3 100 1412341 true 42.43
85.23423424
+another record 98 4 101 9999999 false 99.89 0.00000009
+third record 45 5 102 999999999 true 89.99
0.00000000000009
+Müller Thomas 42 3 100 1412341 true 42.43 85.23423424
diff --git a/ql/src/test/results/clientpositive/llap/serde_regex.q.out
b/ql/src/test/results/clientpositive/llap/serde_regex.q.out
index dfa39c24e39..1e5bc8f2ec0 100644
--- a/ql/src/test/results/clientpositive/llap/serde_regex.q.out
+++ b/ql/src/test/results/clientpositive/llap/serde_regex.q.out
@@ -252,3 +252,96 @@ POSTHOOK: query: DROP TABLE serde_regex1
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@serde_regex1
POSTHOOK: Output: default@serde_regex1
+PREHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Create Table
+ columns: key string, value string
+ name: default.serde_regex2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+ serde name: org.apache.hadoop.hive.serde2.RegexSerDe
+ serde properties:
+ input.regex ([^ ]*),([^ ]*)
+ serialization.encoding ISO8859_1
+
+PREHOOK: query: CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: CREATE TABLE serde_regex2(
+ key STRING,
+ value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+ "input.regex" = "([^ ]*),([^ ]*)",
+ "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: LOAD DATA LOCAL INPATH
"../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: LOAD DATA LOCAL INPATH
"../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+Jørgensen Jørgen
+Müller Thomas
+Nåm Fæk
+Peña Andrés
+PREHOOK: query: DROP TABLE serde_regex2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@serde_regex2
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: DROP TABLE serde_regex2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@serde_regex2
+POSTHOOK: Output: default@serde_regex2
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
b/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
index e1518da541c..243eeacf771 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
@@ -49,9 +49,9 @@ import au.com.bytecode.opencsv.CSVWriter;
*
*/
@SerDeSpec(schemaProps = {
- serdeConstants.LIST_COLUMNS,
+ serdeConstants.LIST_COLUMNS, serdeConstants.SERIALIZATION_ENCODING,
OpenCSVSerde.SEPARATORCHAR, OpenCSVSerde.QUOTECHAR,
OpenCSVSerde.ESCAPECHAR})
-public final class OpenCSVSerde extends AbstractSerDe {
+public final class OpenCSVSerde extends AbstractEncodingAwareSerDe {
private ObjectInspector inspector;
private String[] outputFields;
@@ -103,7 +103,7 @@ public final class OpenCSVSerde extends AbstractSerDe {
}
@Override
- public Writable serialize(Object obj, ObjectInspector objInspector) throws
SerDeException {
+ public Writable doSerialize(Object obj, ObjectInspector objInspector) throws
SerDeException {
final StructObjectInspector outputRowOI = (StructObjectInspector)
objInspector;
final List<? extends StructField> outputFieldRefs =
outputRowOI.getAllStructFieldRefs();
@@ -144,7 +144,7 @@ public final class OpenCSVSerde extends AbstractSerDe {
}
@Override
- public Object deserialize(final Writable blob) throws SerDeException {
+ public Object doDeserialize(final Writable blob) throws SerDeException {
Text rowText = (Text) blob;
CSVReader csv = null;
@@ -202,4 +202,14 @@ public final class OpenCSVSerde extends AbstractSerDe {
public Class<? extends Writable> getSerializedClass() {
return Text.class;
}
+
+ protected Text transformFromUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextFromUTF8(text, this.charset);
+ }
+
+ protected Text transformToUTF8(Writable blob) {
+ Text text = (Text) blob;
+ return SerDeUtils.transformTextToUTF8(text, this.charset);
+ }
}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
b/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
index d1a24bdce8a..d3f4848a9a3 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
@@ -67,9 +67,9 @@ import com.google.common.collect.Lists;
* based Regex library.
*/
@SerDeSpec(schemaProps = {
- serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
+ serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
serdeConstants.SERIALIZATION_ENCODING,
RegexSerDe.INPUT_REGEX, RegexSerDe.INPUT_REGEX_CASE_SENSITIVE })
-public class RegexSerDe extends AbstractSerDe {
+public class RegexSerDe extends AbstractEncodingAwareSerDe {
public static final String INPUT_REGEX = "input.regex";
public static final String INPUT_REGEX_CASE_SENSITIVE =
"input.regex.case.insensitive";
@@ -159,7 +159,7 @@ public class RegexSerDe extends AbstractSerDe {
long partialMatchedRowsCount = 0;
@Override
- public Object deserialize(Writable blob) throws SerDeException {
+ public Object doDeserialize(Writable blob) throws SerDeException {
Text rowText = (Text) blob;
Matcher m = inputPattern.matcher(rowText.toString());
@@ -267,9 +267,21 @@ public class RegexSerDe extends AbstractSerDe {
}
@Override
- public Writable serialize(Object obj, ObjectInspector objInspector)
+ public Writable doSerialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
throw new UnsupportedOperationException(
"Regex SerDe doesn't support the serialize() method");
}
+
+ @Override
+ protected Writable transformFromUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextFromUTF8(text, this.charset);
+ }
+
+ @Override
+ protected Writable transformToUTF8(Writable blob) {
+ Text text = (Text)blob;
+ return SerDeUtils.transformTextToUTF8(text, this.charset);
+ }
}