TAJO-902: Unicode delimiter does not work correctly. (jinho) Closes #53
Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/10caff07 Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/10caff07 Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/10caff07 Branch: refs/heads/window_function Commit: 10caff074e2f00887134f94b0dc918b3cef0e824 Parents: d84d5ca Author: jinossy <[email protected]> Authored: Mon Jul 7 14:14:16 2014 +0900 Committer: jinossy <[email protected]> Committed: Mon Jul 7 14:14:16 2014 +0900 ---------------------------------------------------------------------- CHANGES | 2 + .../org/apache/tajo/cli/DescTableCommand.java | 18 +++- .../java/org/apache/tajo/util/StringUtils.java | 15 ++++ .../org/apache/tajo/util/TestStringUtil.java | 92 ++++++++++++++++++++ .../apache/tajo/engine/parser/SQLAnalyzer.java | 15 +--- 5 files changed, 128 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index 74958dd..661b488 100644 --- a/CHANGES +++ b/CHANGES @@ -74,6 +74,8 @@ Release 0.9.0 - unreleased BUG FIXES + TAJO-902: Unicode delimiter does not work correctly. (jinho) + TAJO-905: When to_date() parses some date without day, the result will be wrong. 
(hyunsik) http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java ---------------------------------------------------------------------- diff --git a/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java b/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java index 6bda7c9..d8023f2 100644 --- a/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java +++ b/tajo-client/src/main/java/org/apache/tajo/cli/DescTableCommand.java @@ -18,6 +18,8 @@ package org.apache.tajo.cli; +import org.apache.commons.lang.CharUtils; +import org.apache.commons.lang.StringEscapeUtils; import org.apache.tajo.catalog.Column; import org.apache.tajo.catalog.TableDesc; import org.apache.tajo.catalog.partition.PartitionMethodDesc; @@ -84,8 +86,22 @@ public class DescTableCommand extends TajoShellCommand { } sb.append("Options: \n"); for(Map.Entry<String, String> entry : desc.getMeta().toMap().entrySet()){ + + /* + * Checks whether the character is ASCII 7 bit printable. + * For example, a printable unicode '\u007c' become the character ‘|’. + * + * Control-chars : ctrl-a(\u0001), tab(\u0009) .. + * Printable-chars : '|'(\u007c), ','(\u002c) .. 
+ * */ + + String value = entry.getValue(); + String unescaped = StringEscapeUtils.unescapeJava(value); + if (unescaped.length() == 1 && CharUtils.isAsciiPrintable(unescaped.charAt(0))) { + value = unescaped; + } sb.append("\t").append("'").append(entry.getKey()).append("'").append("=") - .append("'").append(entry.getValue()).append("'").append("\n"); + .append("'").append(value).append("'").append("\n"); } sb.append("\n"); sb.append("schema: \n"); http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java index ed9014d..41ea153 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java @@ -18,6 +18,8 @@ package org.apache.tajo.util; +import org.apache.commons.lang.CharUtils; +import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.SystemUtils; import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.util.SignalLogger; @@ -165,4 +167,17 @@ public class StringUtils { } }, SHUTDOWN_HOOK_PRIORITY); } + + public static String unicodeEscapedDelimiter(String value) { + try { + String delimiter = StringEscapeUtils.unescapeJava(value); + return unicodeEscapedDelimiter(delimiter.charAt(0)); + } catch (Throwable e) { + } + return value; + } + + public static String unicodeEscapedDelimiter(char c) { + return CharUtils.unicodeEscaped(c); + } } http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java 
b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java new file mode 100644 index 0000000..5c13f8f --- /dev/null +++ b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.util; + +import org.apache.commons.lang.CharUtils; +import org.apache.commons.lang.StringEscapeUtils; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; + +public class TestStringUtil { + + @Test + public void testUnicodeEscapedDelimiter() { + for (int i = 0; i < 128; i++) { + char c = (char) i; + String delimiter = CharUtils.unicodeEscaped(c); + String escapedDelimiter = StringUtils.unicodeEscapedDelimiter(delimiter); + assertEquals(delimiter, escapedDelimiter); + assertEquals(1, StringEscapeUtils.unescapeJava(escapedDelimiter).length()); + assertEquals(c, StringEscapeUtils.unescapeJava(escapedDelimiter).charAt(0)); + } + } + + @Test + public void testUnescapedDelimiter() { + for (int i = 0; i < 128; i++) { + char c = (char) i; + String delimiter = String.valueOf(c); + String escapedDelimiter = StringUtils.unicodeEscapedDelimiter(delimiter); + assertEquals(CharUtils.unicodeEscaped(c), escapedDelimiter); + assertEquals(1, StringEscapeUtils.unescapeJava(escapedDelimiter).length()); + assertEquals(c, StringEscapeUtils.unescapeJava(escapedDelimiter).charAt(0)); + } + } + + @Test + public void testVariousDelimiter() { + /* + * Character ASCII Unicode + * + * Horizontal tab 9 <U0009> + * Space Bar 32 <U0020> + * 1 49 <U0031> + * | 124 <U007c> + * + * */ + + + String escapedDelimiter = "\\u0031"; + + assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter("1")); + assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter("\\1")); + assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter("\\u0031")); + assertEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter((char)49)); + assertNotEquals(escapedDelimiter, StringUtils.unicodeEscapedDelimiter('\001')); + + String delimiter = "|"; + assertEquals("\\u007c", StringUtils.unicodeEscapedDelimiter(delimiter)); + assertEquals(delimiter, 
StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(delimiter))); + + + String commaDelimiter = ","; + assertEquals("\\u002c", StringUtils.unicodeEscapedDelimiter(commaDelimiter)); + assertEquals(commaDelimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(commaDelimiter))); + + String tabDelimiter = "\t"; + assertEquals("\\u0009", StringUtils.unicodeEscapedDelimiter(tabDelimiter)); + assertEquals(tabDelimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(tabDelimiter))); + + String spaceDelimiter = " "; + assertEquals("\\u0020", StringUtils.unicodeEscapedDelimiter(spaceDelimiter)); + assertEquals(spaceDelimiter, StringEscapeUtils.unescapeJava(StringUtils.unicodeEscapedDelimiter(spaceDelimiter))); + } +} http://git-wip-us.apache.org/repos/asf/tajo/blob/10caff07/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java ---------------------------------------------------------------------- diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java b/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java index a638735..01568af 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/parser/SQLAnalyzer.java @@ -24,15 +24,14 @@ import org.antlr.v4.runtime.ANTLRInputStream; import org.antlr.v4.runtime.CommonTokenStream; import org.antlr.v4.runtime.misc.NotNull; import org.antlr.v4.runtime.tree.TerminalNode; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.tajo.algebra.*; import org.apache.tajo.algebra.Aggregation.GroupType; import org.apache.tajo.algebra.LiteralValue.LiteralType; import org.apache.tajo.catalog.CatalogUtil; import org.apache.tajo.engine.parser.SQLParser.*; import org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.util.StringUtils; -import java.nio.charset.Charset; import java.util.*; import static 
org.apache.tajo.algebra.Aggregation.GroupElement; @@ -1341,7 +1340,7 @@ public class SQLAnalyzer extends SQLParserBaseVisitor<Expr> { Map<String, String> params = new HashMap<String, String>(); for (Map.Entry<String, String> entry : map.entrySet()) { if (entry.getKey().equals(StorageConstants.CSVFILE_DELIMITER)) { - params.put(entry.getKey(), escapeDelimiter(entry.getValue())); + params.put(entry.getKey(), StringUtils.unicodeEscapedDelimiter(entry.getValue())); } else { params.put(entry.getKey(), entry.getValue()); } @@ -1349,16 +1348,6 @@ public class SQLAnalyzer extends SQLParserBaseVisitor<Expr> { return params; } - public static String escapeDelimiter(String value) { - try { - String delimiter = StringEscapeUtils.unescapeJava(value); - delimiter = new String(new byte[]{Byte.valueOf(delimiter).byteValue()}, Charset.defaultCharset()); - return StringEscapeUtils.escapeJava(delimiter); - } catch (NumberFormatException e) { - } - return value; - } - private static String stripQuote(String str) { return str.substring(1, str.length() - 1); }
