Repository: tajo Updated Branches: refs/heads/branch-0.11.0 9aefc9f01 -> 3e9b9f61a
TAJO-1777: JsonLineDeserializer returns invalid unicode text, if contains control character. Closes #696 Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/3e9b9f61 Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/3e9b9f61 Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/3e9b9f61 Branch: refs/heads/branch-0.11.0 Commit: 3e9b9f61a65dafe83d6ef2223b1462eb8bf94867 Parents: 9aefc9f Author: Jinho Kim <[email protected]> Authored: Tue Aug 18 16:55:18 2015 +0900 Committer: Jinho Kim <[email protected]> Committed: Tue Aug 18 16:55:18 2015 +0900 ---------------------------------------------------------------------- CHANGES | 3 ++ .../tajo/storage/json/JsonLineDeserializer.java | 15 +++++---- .../apache/tajo/storage/json/TestJsonSerDe.java | 32 ++++++++++++++++++++ .../testUnicodeWithControlChar.json | 1 + 4 files changed, 43 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/3e9b9f61/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index f6c5461..16f186f 100644 --- a/CHANGES +++ b/CHANGES @@ -221,6 +221,9 @@ Release 0.11.0 - unreleased BUG FIXES + TAJO-1777: JsonLineDeserializer returns invalid unicode text, + if contains control character. (jinho) + TAJO-1779: Remove "DFSInputStream has been closed already" messages in DelimitedLineReader. 
(jinho) http://git-wip-us.apache.org/repos/asf/tajo/blob/3e9b9f61/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java index c720118..9216025 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java @@ -21,23 +21,22 @@ package org.apache.tajo.storage.json; import com.facebook.presto.hive.shaded.com.google.common.collect.Lists; import io.netty.buffer.ByteBuf; +import io.netty.util.CharsetUtil; import net.minidev.json.JSONObject; import net.minidev.json.parser.JSONParser; import net.minidev.json.parser.ParseException; -import org.apache.tajo.catalog.*; import org.apache.commons.net.util.Base64; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.*; import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.datum.DatumFactory; import org.apache.tajo.datum.NullDatum; -import org.apache.tajo.datum.TextDatum; import org.apache.tajo.exception.NotImplementedException; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.text.TextLineDeserializer; import org.apache.tajo.storage.text.TextLineParsingError; import java.io.IOException; +import java.nio.charset.CharsetDecoder; import java.util.Map; public class JsonLineDeserializer extends TextLineDeserializer { @@ -46,6 +45,7 @@ public class JsonLineDeserializer extends TextLineDeserializer { // Full Path -> Type private final Map<String, Type> types; private final String [] projectedPaths; + private final CharsetDecoder decoder = 
CharsetUtil.getDecoder(CharsetUtil.UTF_8); public JsonLineDeserializer(Schema schema, TableMeta meta, Column [] projected) { super(schema, meta); @@ -214,17 +214,16 @@ public class JsonLineDeserializer extends TextLineDeserializer { @Override public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineParsingError { - byte[] line = new byte[buf.readableBytes()]; - buf.readBytes(line); + String line = decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString(); JSONObject object; try { object = (JSONObject) parser.parse(line); } catch (ParseException pe) { - throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe); + throw new TextLineParsingError(line, pe); } catch (ArrayIndexOutOfBoundsException ae) { // truncated value - throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), ae); + throw new TextLineParsingError(line, ae); } for (int i = 0; i < projectedPaths.length; i++) { http://git-wip-us.apache.org/repos/asf/tajo/blob/3e9b9f61/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java index 8095081..88d7536 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java @@ -93,4 +93,36 @@ public class TestJsonSerDe { assertEquals(baseTuple, tuple); } + + @Test + public void testUnicodeWithControlChar() throws IOException { + TajoConf conf = new TajoConf(); + + TableMeta meta = CatalogUtil.newTableMeta("JSON"); + Path tablePath = new Path(getResourcePath("dataset", "TestJsonSerDe"), "testUnicodeWithControlChar.json"); + FileSystem fs = 
FileSystem.getLocal(conf); + FileStatus status = fs.getFileStatus(tablePath); + FileFragment fragment = new FileFragment("table", tablePath, 0, status.getLen()); + + Schema schema = new Schema(); + schema.addColumn("col1", TajoDataTypes.Type.TEXT); + schema.addColumn("col2", TajoDataTypes.Type.TEXT); + schema.addColumn("col3", TajoDataTypes.Type.TEXT); + Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, fragment); + scanner.init(); + + Tuple tuple = scanner.next(); + assertNotNull(tuple); + assertNull(scanner.next()); + scanner.close(); + + + Tuple baseTuple = new VTuple(new Datum[] { + DatumFactory.createText("tajo"), + DatumFactory.createText("타조"), + DatumFactory.createText("타\n조") + }); + + assertEquals(baseTuple, tuple); + } } http://git-wip-us.apache.org/repos/asf/tajo/blob/3e9b9f61/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json new file mode 100644 index 0000000..5446469 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json @@ -0,0 +1 @@ +{"col1": "tajo", "col2":"타조", "col3":"타\n조"}
