HIVE-12164 : non-ascii characters shows improper with insert into (Aleksei Statkevich via Xuefu Zhang)
Signed-off-by: Ashutosh Chauhan <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e8c8a330 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e8c8a330 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e8c8a330 Branch: refs/heads/master-fixed Commit: e8c8a33029da7e7ebef7ecc4af454b26912491bc Parents: 9dae39c Author: Aleksei Statkevich <[email protected]> Authored: Mon Oct 19 22:37:00 2015 -0800 Committer: Ashutosh Chauhan <[email protected]> Committed: Thu Nov 5 13:54:53 2015 -0800 ---------------------------------------------------------------------- .../hadoop/hive/ql/parse/SemanticAnalyzer.java | 16 ++++++++--- .../clientpositive/insert_values_nonascii.q | 9 +++++++ .../clientpositive/insert_values_nonascii.q.out | 28 ++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/e8c8a330/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index f3d7057..f7e2039 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -216,6 +216,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.shims.HadoopShims; import org.apache.hadoop.hive.shims.Utils; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.OutputFormat; import org.apache.hadoop.security.UserGroupInformation; @@ -733,6 +734,15 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { } /** + * Convert a string to Text format 
and write its bytes in the same way TextOutputFormat would do. + * This is needed to properly encode non-ascii characters. + */ + private static void writeAsText(String text, FSDataOutputStream out) throws IOException { + Text to = new Text(text); + out.write(to.getBytes(), 0, to.getLength()); + } + + /** * Generate a temp table out of a value clause * See also {@link #preProcessForInsert(ASTNode, QB)} */ @@ -810,10 +820,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { fields.add(new FieldSchema("tmp_values_col" + nextColNum++, "string", "")); } if (isFirst) isFirst = false; - else out.writeBytes("\u0001"); - out.writeBytes(unparseExprForValuesClause(value)); + else writeAsText("\u0001", out); + writeAsText(unparseExprForValuesClause(value), out); } - out.writeBytes("\n"); + writeAsText("\n", out); firstRow = false; } out.close(); http://git-wip-us.apache.org/repos/asf/hive/blob/e8c8a330/ql/src/test/queries/clientpositive/insert_values_nonascii.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/insert_values_nonascii.q b/ql/src/test/queries/clientpositive/insert_values_nonascii.q new file mode 100644 index 0000000..2e4ef41 --- /dev/null +++ b/ql/src/test/queries/clientpositive/insert_values_nonascii.q @@ -0,0 +1,9 @@ +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.enforce.bucketing=true; + +create table insert_values_nonascii(t1 char(32), t2 string); + +insert into insert_values_nonascii values("Абвгде Garçu 谢谢", "Kôkaku ありがとう"), ("ございます", "kidôtai한국어"); + +select * from insert_values_nonascii; http://git-wip-us.apache.org/repos/asf/hive/blob/e8c8a330/ql/src/test/results/clientpositive/insert_values_nonascii.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/insert_values_nonascii.q.out 
b/ql/src/test/results/clientpositive/insert_values_nonascii.q.out new file mode 100644 index 0000000..ca07bef --- /dev/null +++ b/ql/src/test/results/clientpositive/insert_values_nonascii.q.out @@ -0,0 +1,28 @@ +PREHOOK: query: create table insert_values_nonascii(t1 char(32), t2 string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@insert_values_nonascii +POSTHOOK: query: create table insert_values_nonascii(t1 char(32), t2 string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@insert_values_nonascii +PREHOOK: query: insert into insert_values_nonascii values("Абвгде Garçu 谢谢", "Kôkaku ありがとう"), ("ございます", "kidôtai한국어") +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@insert_values_nonascii +POSTHOOK: query: insert into insert_values_nonascii values("Абвгде Garçu 谢谢", "Kôkaku ありがとう"), ("ございます", "kidôtai한국어") +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@insert_values_nonascii +POSTHOOK: Lineage: insert_values_nonascii.t1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: insert_values_nonascii.t2 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: select * from insert_values_nonascii +PREHOOK: type: QUERY +PREHOOK: Input: default@insert_values_nonascii +#### A masked pattern was here #### +POSTHOOK: query: select * from insert_values_nonascii +POSTHOOK: type: QUERY +POSTHOOK: Input: default@insert_values_nonascii +#### A masked pattern was here #### +Абвгде Garçu 谢谢 Kôkaku ありがとう +ございます kidôtai한국어
