HIVE-12164 : non-ascii characters shows improper with insert into (Aleksei 
Statkevich via Xuefu Zhang)

Signed-off-by: Ashutosh Chauhan <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e8c8a330
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e8c8a330
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e8c8a330

Branch: refs/heads/master-fixed
Commit: e8c8a33029da7e7ebef7ecc4af454b26912491bc
Parents: 9dae39c
Author: Aleksei Statkevich <[email protected]>
Authored: Mon Oct 19 22:37:00 2015 -0800
Committer: Ashutosh Chauhan <[email protected]>
Committed: Thu Nov 5 13:54:53 2015 -0800

----------------------------------------------------------------------
 .../hadoop/hive/ql/parse/SemanticAnalyzer.java  | 16 ++++++++---
 .../clientpositive/insert_values_nonascii.q     |  9 +++++++
 .../clientpositive/insert_values_nonascii.q.out | 28 ++++++++++++++++++++
 3 files changed, 50 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/e8c8a330/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index f3d7057..f7e2039 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -216,6 +216,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.hadoop.hive.shims.HadoopShims;
 import org.apache.hadoop.hive.shims.Utils;
 import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.InputFormat;
 import org.apache.hadoop.mapred.OutputFormat;
 import org.apache.hadoop.security.UserGroupInformation;
@@ -733,6 +734,15 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
   }
 
   /**
+   * Convert a string to Text format and write its bytes in the same way TextOutputFormat would do.
+   * This is needed to properly encode non-ascii characters.
+   */
+  private static void writeAsText(String text, FSDataOutputStream out) throws IOException {
+    Text to = new Text(text);
+    out.write(to.getBytes(), 0, to.getLength());
+  }
+
+  /**
    * Generate a temp table out of a value clause
    * See also {@link #preProcessForInsert(ASTNode, QB)}
    */
@@ -810,10 +820,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
             fields.add(new FieldSchema("tmp_values_col" + nextColNum++, "string", ""));
           }
           if (isFirst) isFirst = false;
-          else out.writeBytes("\u0001");
-          out.writeBytes(unparseExprForValuesClause(value));
+          else writeAsText("\u0001", out);
+          writeAsText(unparseExprForValuesClause(value), out);
         }
-        out.writeBytes("\n");
+        writeAsText("\n", out);
         firstRow = false;
       }
       out.close();

http://git-wip-us.apache.org/repos/asf/hive/blob/e8c8a330/ql/src/test/queries/clientpositive/insert_values_nonascii.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/insert_values_nonascii.q b/ql/src/test/queries/clientpositive/insert_values_nonascii.q
new file mode 100644
index 0000000..2e4ef41
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/insert_values_nonascii.q
@@ -0,0 +1,9 @@
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.enforce.bucketing=true;
+
+create table insert_values_nonascii(t1 char(32), t2 string);
+
+insert into insert_values_nonascii values("Абвгде Garçu 谢谢",  "Kôkaku ありがとう"), ("ございます", "kidôtai한국어");
+
+select * from insert_values_nonascii;

http://git-wip-us.apache.org/repos/asf/hive/blob/e8c8a330/ql/src/test/results/clientpositive/insert_values_nonascii.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/insert_values_nonascii.q.out b/ql/src/test/results/clientpositive/insert_values_nonascii.q.out
new file mode 100644
index 0000000..ca07bef
--- /dev/null
+++ b/ql/src/test/results/clientpositive/insert_values_nonascii.q.out
@@ -0,0 +1,28 @@
+PREHOOK: query: create table insert_values_nonascii(t1 char(32), t2 string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@insert_values_nonascii
+POSTHOOK: query: create table insert_values_nonascii(t1 char(32), t2 string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@insert_values_nonascii
+PREHOOK: query: insert into insert_values_nonascii values("Абвгде Garçu 谢谢",  "Kôkaku ありがとう"), ("ございます", "kidôtai한국어")
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@insert_values_nonascii
+POSTHOOK: query: insert into insert_values_nonascii values("Абвгде Garçu 谢谢",  "Kôkaku ありがとう"), ("ございます", "kidôtai한국어")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@insert_values_nonascii
+POSTHOOK: Lineage: insert_values_nonascii.t1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: insert_values_nonascii.t2 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: select * from insert_values_nonascii
+PREHOOK: type: QUERY
+PREHOOK: Input: default@insert_values_nonascii
+#### A masked pattern was here ####
+POSTHOOK: query: select * from insert_values_nonascii
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@insert_values_nonascii
+#### A masked pattern was here ####
+Абвгде Garçu 谢谢                     Kôkaku ありがとう
+ございます                                kidôtai한국어

Reply via email to