This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 6722bcb5283 HIVE-15826: Add 'serialization.encoding' To All SerDes (Shailesh Gupta, reviewed by Laszlo Bodor, Krisztian Kasa )
6722bcb5283 is described below

commit 6722bcb5283b04080cfbf06ca45c75772a8f2e4c
Author: Shailesh Gupta <[email protected]>
AuthorDate: Thu Mar 2 13:12:42 2023 +0530

    HIVE-15826: Add 'serialization.encoding' To All SerDes (Shailesh Gupta, reviewed by Laszlo Bodor, Krisztian Kasa )
    
    Co-authored-by: Shailesh Gupta <[email protected]>
---
 .../hadoop/hive/contrib/serde2/RegexSerDe.java     | 23 ++++--
 .../src/test/queries/clientpositive/serde_regex.q  | 30 ++++++-
 .../test/results/clientpositive/serde_regex.q.out  | 93 ++++++++++++++++++++++
 data/files/opencsv-data.txt                        |  3 +-
 .../org/apache/hive/hcatalog/data/JsonSerDe.java   | 30 +++++--
 ql/src/test/queries/clientpositive/json_serde1.q   | 11 +++
 ql/src/test/queries/clientpositive/serde_opencsv.q |  8 +-
 ql/src/test/queries/clientpositive/serde_regex.q   | 29 +++++++
 .../results/clientpositive/llap/json_serde1.q.out  | 46 +++++++++++
 .../clientpositive/llap/serde_opencsv.q.out        | 27 +++++--
 .../results/clientpositive/llap/serde_regex.q.out  | 93 ++++++++++++++++++++++
 .../apache/hadoop/hive/serde2/OpenCSVSerde.java    | 18 ++++-
 .../org/apache/hadoop/hive/serde2/RegexSerDe.java  | 20 ++++-
 13 files changed, 404 insertions(+), 27 deletions(-)

diff --git a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java b/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
index 9f1a9b6919b..ff8fb8f339d 100644
--- a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
+++ b/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java
@@ -25,11 +25,12 @@ import java.util.Properties;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeSpec;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -70,10 +71,10 @@ import org.apache.hadoop.io.Writable;
  * based Regex library.
  */
 @SerDeSpec(schemaProps = {
-    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
+    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.SERIALIZATION_ENCODING,
     RegexSerDe.INPUT_REGEX, RegexSerDe.OUTPUT_FORMAT_STRING,
     RegexSerDe.INPUT_REGEX_CASE_SENSITIVE })
-public class RegexSerDe extends AbstractSerDe {
+public class RegexSerDe extends AbstractEncodingAwareSerDe {
 
   public static final String INPUT_REGEX = "input.regex";
   public static final String OUTPUT_FORMAT_STRING = "output.format.string";
@@ -156,7 +157,7 @@ public class RegexSerDe extends AbstractSerDe {
   }
 
   @Override
-  public Object deserialize(Writable blob) throws SerDeException {
+  public Object doDeserialize(Writable blob) throws SerDeException {
 
     if (inputPattern == null) {
       throw new SerDeException(
@@ -200,7 +201,7 @@ public class RegexSerDe extends AbstractSerDe {
   Text outputRowText;
 
   @Override
-  public Writable serialize(Object obj, ObjectInspector objInspector)
+  public Writable doSerialize(Object obj, ObjectInspector objInspector)
       throws SerDeException {
 
     if (outputFormatString == null) {
@@ -252,4 +253,16 @@ public class RegexSerDe extends AbstractSerDe {
     return outputRowText;
   }
 
+  @Override
+  protected Writable transformFromUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextFromUTF8(text, this.charset);
+  }
+
+  @Override
+  protected Writable transformToUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextToUTF8(text, this.charset);
+  }
+
 }
diff --git a/contrib/src/test/queries/clientpositive/serde_regex.q b/contrib/src/test/queries/clientpositive/serde_regex.q
index 8aa3eda201d..8487fa01a0e 100644
--- a/contrib/src/test/queries/clientpositive/serde_regex.q
+++ b/contrib/src/test/queries/clientpositive/serde_regex.q
@@ -39,4 +39,32 @@ STORED AS TEXTFILE;
 LOAD DATA LOCAL INPATH "../../data/files/apache.access.log" INTO TABLE serde_regex;
 LOAD DATA LOCAL INPATH "../../data/files/apache.access.2.log" INTO TABLE serde_regex;
 
-SELECT * FROM serde_regex ORDER BY `time`;
\ No newline at end of file
+SELECT * FROM serde_regex ORDER BY `time`;
+
+
+EXPLAIN
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2;
+
+SELECT key, value FROM serde_regex2 ORDER BY key, value;
+
+DROP TABLE serde_regex2;
\ No newline at end of file
diff --git a/contrib/src/test/results/clientpositive/serde_regex.q.out b/contrib/src/test/results/clientpositive/serde_regex.q.out
index 7462568938b..ff1b771b813 100644
--- a/contrib/src/test/results/clientpositive/serde_regex.q.out
+++ b/contrib/src/test/results/clientpositive/serde_regex.q.out
@@ -117,3 +117,96 @@ POSTHOOK: Input: default@serde_regex
 #### A masked pattern was here ####
 127.0.0.1      -       frank   [10/Oct/2000:13:55:36 -0700]    "GET /apache_pb.gif HTTP/1.0"   200     2326    NULL    NULL
 127.0.0.1      -       -       [26/May/2009:00:00:00 +0000]    "GET /someurl/?track=Blabla(Main) HTTP/1.1"     200     5864    -       "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.65 Safari/525.19"
+PREHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Create Table
+      columns: key string, value string
+      name: default.serde_regex2
+      input format: org.apache.hadoop.mapred.TextInputFormat
+      output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+      serde name: org.apache.hadoop.hive.serde2.RegexSerDe
+      serde properties:
+        input.regex ([^ ]*),([^ ]*)
+        serialization.encoding ISO8859_1
+
+PREHOOK: query: CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+Jørgensen      Jørgen
+Müller Thomas
+Nåm    Fæk
+Peña   Andrés
+PREHOOK: query: DROP TABLE serde_regex2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@serde_regex2
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: DROP TABLE serde_regex2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@serde_regex2
+POSTHOOK: Output: default@serde_regex2
diff --git a/data/files/opencsv-data.txt b/data/files/opencsv-data.txt
index 7d5968b468f..3314bf57151 100644
--- a/data/files/opencsv-data.txt
+++ b/data/files/opencsv-data.txt
@@ -1,3 +1,4 @@
 why hello there,42,3,100,1412341,true,42.43,85.23423424
 another record,98,4,101,9999999,false,99.89,0.00000009
-third record,45,5,102,999999999,true,89.99,0.00000000000009
\ No newline at end of file
+third record,45,5,102,999999999,true,89.99,0.00000000000009
+Müller Thomas,42,3,100,1412341,true,42.43,85.23423424
\ No newline at end of file
diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
index 0525b7e51cb..7defd21c081 100644
--- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
+++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
@@ -31,9 +31,10 @@ import org.apache.commons.lang3.ArrayUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.SerDeSpec;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 import org.apache.hadoop.io.Text;
@@ -44,8 +45,9 @@ import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
 
 @SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS,
                           serdeConstants.LIST_COLUMN_TYPES,
-                          serdeConstants.TIMESTAMP_FORMATS})
-public class JsonSerDe extends AbstractSerDe {
+                          serdeConstants.TIMESTAMP_FORMATS,
+                          serdeConstants.SERIALIZATION_ENCODING})
+public class JsonSerDe extends AbstractEncodingAwareSerDe {
 
   private HCatSchema schema;
 
@@ -78,7 +80,7 @@ public class JsonSerDe extends AbstractSerDe {
    * our own object implementation, and we use HCatRecord for it
    */
   @Override
-  public Object deserialize(Writable blob) throws SerDeException {
+  public Object doDeserialize(Writable blob) throws SerDeException {
     try {
       List<?> row = (List<?>) jsonSerde.deserialize(blob);
       List<Object> fatRow = fatLand(row);
@@ -165,7 +167,7 @@ public class JsonSerDe extends AbstractSerDe {
    * and generate a Text representation of the object.
    */
   @Override
-  public Writable serialize(Object obj, ObjectInspector objInspector)
+  public Writable doSerialize(Object obj, ObjectInspector objInspector)
     throws SerDeException {
     return jsonSerde.serialize(obj, objInspector);
   }
@@ -184,4 +186,22 @@ public class JsonSerDe extends AbstractSerDe {
     return Text.class;
   }
 
+  /**
+   * Transform Writable data from UTF-8 to charset before serialize.
+   * @param blob
+   * @return
+   */
+  
+  @Override
+  protected Writable transformFromUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextFromUTF8(text, this.charset);
+  }
+
+  @Override
+  protected Writable transformToUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextToUTF8(text, this.charset);
+  }
+
 }
diff --git a/ql/src/test/queries/clientpositive/json_serde1.q b/ql/src/test/queries/clientpositive/json_serde1.q
index fcbf1c07529..3cb0c0fe481 100644
--- a/ql/src/test/queries/clientpositive/json_serde1.q
+++ b/ql/src/test/queries/clientpositive/json_serde1.q
@@ -3,6 +3,7 @@
 drop table if exists json_serde1_1;
 drop table if exists json_serde1_2;
 drop table if exists json_serde1_3;
+drop table if exists json_serde1_4;
 
 create table json_serde1_1 (a array<string>,b map<string,int>)
   row format serde 'org.apache.hive.hcatalog.data.JsonSerDe';
@@ -35,6 +36,16 @@ select * from json_serde1_2;
 create table json_serde1_3 (c1 int, c2 string) stored as jsonfile;
 show create table json_serde1_3;
 
+create table json_serde1_4 (a array<string>,b map<string,int>)
+  row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
+  WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1');
+
+insert into table json_serde1_4
+  select array('Müller'),map('Müller',1) from src limit 2;
+
+select * from json_serde1_4;
+
 drop table json_serde1_1;
 drop table json_serde1_2;
 drop table json_serde1_3;
+drop table json_serde1_4;
diff --git a/ql/src/test/queries/clientpositive/serde_opencsv.q b/ql/src/test/queries/clientpositive/serde_opencsv.q
index 26d79a66305..6e620c099fe 100644
--- a/ql/src/test/queries/clientpositive/serde_opencsv.q
+++ b/ql/src/test/queries/clientpositive/serde_opencsv.q
@@ -12,7 +12,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
 WITH SERDEPROPERTIES(
   "separatorChar" = ",",
   "quoteChar"     = "\'",
-  "escapeChar"    = "\\"
+  "escapeChar"    = "\\",
+  "serialization.encoding" = "ISO8859_1"
 ) stored as textfile;
 
 CREATE TABLE serde_opencsv(
@@ -28,9 +29,12 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
 WITH SERDEPROPERTIES(
   "separatorChar" = ",",
   "quoteChar"     = "\'",
-  "escapeChar"    = "\\"
+  "escapeChar"    = "\\",
+  "serialization.encoding" = "ISO8859_1"
 ) stored as textfile;
 
 LOAD DATA LOCAL INPATH "../../data/files/opencsv-data.txt" INTO TABLE serde_opencsv;
 
 SELECT count(*) FROM serde_opencsv;
+
+SELECT * FROM serde_opencsv;
diff --git a/ql/src/test/queries/clientpositive/serde_regex.q b/ql/src/test/queries/clientpositive/serde_regex.q
index fc716ed8327..a193ef76555 100644
--- a/ql/src/test/queries/clientpositive/serde_regex.q
+++ b/ql/src/test/queries/clientpositive/serde_regex.q
@@ -41,6 +41,7 @@ SELECT host, size, status, `time` from serde_regex ORDER BY `time`;
 
 DROP TABLE serde_regex;
 
+
 EXPLAIN
 CREATE TABLE serde_regex1(
   key decimal(38,18),
@@ -65,3 +66,31 @@ LOAD DATA LOCAL INPATH "../../data/files/kv7.txt" INTO TABLE serde_regex1;
 SELECT key, value FROM serde_regex1 ORDER BY key, value;
 
 DROP TABLE serde_regex1;
+
+
+EXPLAIN
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2;
+
+SELECT key, value FROM serde_regex2 ORDER BY key, value;
+
+DROP TABLE serde_regex2;
diff --git a/ql/src/test/results/clientpositive/llap/json_serde1.q.out b/ql/src/test/results/clientpositive/llap/json_serde1.q.out
index 341a494e0c5..0ee5f505de7 100644
--- a/ql/src/test/results/clientpositive/llap/json_serde1.q.out
+++ b/ql/src/test/results/clientpositive/llap/json_serde1.q.out
@@ -10,6 +10,10 @@ PREHOOK: query: drop table if exists json_serde1_3
 PREHOOK: type: DROPTABLE
 POSTHOOK: query: drop table if exists json_serde1_3
 POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists json_serde1_4
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists json_serde1_4
+POSTHOOK: type: DROPTABLE
 PREHOOK: query: create table json_serde1_1 (a array<string>,b map<string,int>)
   row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
 PREHOOK: type: CREATETABLE
@@ -127,6 +131,40 @@ LOCATION
 TBLPROPERTIES (
   'bucketing_version'='2', 
 #### A masked pattern was here ####
+PREHOOK: query: create table json_serde1_4 (a array<string>,b map<string,int>)
+  row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
+  WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@json_serde1_4
+POSTHOOK: query: create table json_serde1_4 (a array<string>,b map<string,int>)
+  row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
+  WITH SERDEPROPERTIES ('serialization.encoding'='ISO8859_1')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@json_serde1_4
+PREHOOK: query: insert into table json_serde1_4
+  select array('Müller'),map('Müller',1) from src limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@json_serde1_4
+POSTHOOK: query: insert into table json_serde1_4
+  select array('Müller'),map('Müller',1) from src limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@json_serde1_4
+POSTHOOK: Lineage: json_serde1_4.a EXPRESSION []
+POSTHOOK: Lineage: json_serde1_4.b EXPRESSION []
+PREHOOK: query: select * from json_serde1_4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@json_serde1_4
+#### A masked pattern was here ####
+POSTHOOK: query: select * from json_serde1_4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@json_serde1_4
+#### A masked pattern was here ####
+["Müller"]    {"Müller":1}
+["Müller"]    {"Müller":1}
 PREHOOK: query: drop table json_serde1_1
 PREHOOK: type: DROPTABLE
 PREHOOK: Input: default@json_serde1_1
@@ -151,3 +189,11 @@ POSTHOOK: query: drop table json_serde1_3
 POSTHOOK: type: DROPTABLE
 POSTHOOK: Input: default@json_serde1_3
 POSTHOOK: Output: default@json_serde1_3
+PREHOOK: query: drop table json_serde1_4
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@json_serde1_4
+PREHOOK: Output: default@json_serde1_4
+POSTHOOK: query: drop table json_serde1_4
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@json_serde1_4
+POSTHOOK: Output: default@json_serde1_4
diff --git a/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out b/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
index f39ee7322c5..730a385f59c 100644
--- a/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
+++ b/ql/src/test/results/clientpositive/llap/serde_opencsv.q.out
@@ -12,7 +12,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
 WITH SERDEPROPERTIES(
   "separatorChar" = ",",
   "quoteChar"     = "\'",
-  "escapeChar"    = "\\"
+  "escapeChar"    = "\\",
+  "serialization.encoding" = "ISO8859_1"
 ) stored as textfile
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
@@ -31,7 +32,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
 WITH SERDEPROPERTIES(
   "separatorChar" = ",",
   "quoteChar"     = "\'",
-  "escapeChar"    = "\\"
+  "escapeChar"    = "\\",
+  "serialization.encoding" = "ISO8859_1"
 ) stored as textfile
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
@@ -51,6 +53,7 @@ STAGE PLANS:
         escapeChar \
         quoteChar '
         separatorChar ,
+        serialization.encoding ISO8859_1
 
 PREHOOK: query: CREATE TABLE serde_opencsv(
                           words STRING,
@@ -65,7 +68,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
 WITH SERDEPROPERTIES(
   "separatorChar" = ",",
   "quoteChar"     = "\'",
-  "escapeChar"    = "\\"
+  "escapeChar"    = "\\",
+  "serialization.encoding" = "ISO8859_1"
 ) stored as textfile
 PREHOOK: type: CREATETABLE
 PREHOOK: Output: database:default
@@ -83,7 +87,8 @@ ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
 WITH SERDEPROPERTIES(
   "separatorChar" = ",",
   "quoteChar"     = "\'",
-  "escapeChar"    = "\\"
+  "escapeChar"    = "\\",
+  "serialization.encoding" = "ISO8859_1"
 ) stored as textfile
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
@@ -104,4 +109,16 @@ POSTHOOK: query: SELECT count(*) FROM serde_opencsv
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@serde_opencsv
 #### A masked pattern was here ####
-3
+4
+PREHOOK: query: SELECT * FROM serde_opencsv
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_opencsv
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM serde_opencsv
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_opencsv
+#### A masked pattern was here ####
+why hello there        42      3       100     1412341 true    42.43   85.23423424
+another record 98      4       101     9999999 false   99.89   0.00000009
+third record   45      5       102     999999999       true    89.99   0.00000000000009
+Müller Thomas 42      3       100     1412341 true    42.43   85.23423424
diff --git a/ql/src/test/results/clientpositive/llap/serde_regex.q.out b/ql/src/test/results/clientpositive/llap/serde_regex.q.out
index dfa39c24e39..1e5bc8f2ec0 100644
--- a/ql/src/test/results/clientpositive/llap/serde_regex.q.out
+++ b/ql/src/test/results/clientpositive/llap/serde_regex.q.out
@@ -252,3 +252,96 @@ POSTHOOK: query: DROP TABLE serde_regex1
 POSTHOOK: type: DROPTABLE
 POSTHOOK: Input: default@serde_regex1
 POSTHOOK: Output: default@serde_regex1
+PREHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: EXPLAIN
+CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Create Table
+      columns: key string, value string
+      name: default.serde_regex2
+      input format: org.apache.hadoop.mapred.TextInputFormat
+      output format: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+      serde name: org.apache.hadoop.hive.serde2.RegexSerDe
+      serde properties:
+        input.regex ([^ ]*),([^ ]*)
+        serialization.encoding ISO8859_1
+
+PREHOOK: query: CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: CREATE TABLE serde_regex2(
+  key STRING,
+  value STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*),([^ ]*)",
+  "serialization.encoding" = "ISO8859_1"
+)
+STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/encoding_iso-8859-1.txt" INTO TABLE serde_regex2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@serde_regex2
+PREHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT key, value FROM serde_regex2 ORDER BY key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@serde_regex2
+#### A masked pattern was here ####
+Jørgensen      Jørgen
+Müller Thomas
+Nåm    Fæk
+Peña   Andrés
+PREHOOK: query: DROP TABLE serde_regex2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@serde_regex2
+PREHOOK: Output: default@serde_regex2
+POSTHOOK: query: DROP TABLE serde_regex2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@serde_regex2
+POSTHOOK: Output: default@serde_regex2
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java b/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
index e1518da541c..243eeacf771 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
@@ -49,9 +49,9 @@ import au.com.bytecode.opencsv.CSVWriter;
  *
  */
 @SerDeSpec(schemaProps = {
-    serdeConstants.LIST_COLUMNS,
+    serdeConstants.LIST_COLUMNS, serdeConstants.SERIALIZATION_ENCODING,
     OpenCSVSerde.SEPARATORCHAR, OpenCSVSerde.QUOTECHAR, OpenCSVSerde.ESCAPECHAR})
-public final class OpenCSVSerde extends AbstractSerDe {
+public final class OpenCSVSerde extends AbstractEncodingAwareSerDe {
 
   private ObjectInspector inspector;
   private String[] outputFields;
@@ -103,7 +103,7 @@ public final class OpenCSVSerde extends AbstractSerDe {
   }
 
   @Override
-  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
+  public Writable doSerialize(Object obj, ObjectInspector objInspector) throws SerDeException {
     final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
     final List<? extends StructField> outputFieldRefs = outputRowOI.getAllStructFieldRefs();
 
@@ -144,7 +144,7 @@ public final class OpenCSVSerde extends AbstractSerDe {
   }
 
   @Override
-  public Object deserialize(final Writable blob) throws SerDeException {
+  public Object doDeserialize(final Writable blob) throws SerDeException {
     Text rowText = (Text) blob;
 
     CSVReader csv = null;
@@ -202,4 +202,14 @@ public final class OpenCSVSerde extends AbstractSerDe {
   public Class<? extends Writable> getSerializedClass() {
     return Text.class;
   }
+
+  protected Text transformFromUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextFromUTF8(text, this.charset);
+  }
+
+  protected Text transformToUTF8(Writable blob) {
+    Text text = (Text) blob;
+    return SerDeUtils.transformTextToUTF8(text, this.charset);
+  }
 }
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
index d1a24bdce8a..d3f4848a9a3 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/RegexSerDe.java
@@ -67,9 +67,9 @@ import com.google.common.collect.Lists;
  * based Regex library.
  */
 @SerDeSpec(schemaProps = {
-    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
+    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES, serdeConstants.SERIALIZATION_ENCODING,
     RegexSerDe.INPUT_REGEX, RegexSerDe.INPUT_REGEX_CASE_SENSITIVE })
-public class RegexSerDe extends AbstractSerDe {
+public class RegexSerDe extends AbstractEncodingAwareSerDe {
 
   public static final String INPUT_REGEX = "input.regex";
   public static final String INPUT_REGEX_CASE_SENSITIVE = "input.regex.case.insensitive";
@@ -159,7 +159,7 @@ public class RegexSerDe extends AbstractSerDe {
   long partialMatchedRowsCount = 0;
 
   @Override
-  public Object deserialize(Writable blob) throws SerDeException {
+  public Object doDeserialize(Writable blob) throws SerDeException {
 
     Text rowText = (Text) blob;
     Matcher m = inputPattern.matcher(rowText.toString());
@@ -267,9 +267,21 @@ public class RegexSerDe extends AbstractSerDe {
   }
 
   @Override
-  public Writable serialize(Object obj, ObjectInspector objInspector)
+  public Writable doSerialize(Object obj, ObjectInspector objInspector)
       throws SerDeException {
         throw new UnsupportedOperationException(
           "Regex SerDe doesn't support the serialize() method");
   }
+
+  @Override
+  protected Writable transformFromUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextFromUTF8(text, this.charset);
+  }
+
+  @Override
+  protected Writable transformToUTF8(Writable blob) {
+    Text text = (Text)blob;
+    return SerDeUtils.transformTextToUTF8(text, this.charset);
+  }
 }

Reply via email to