TAJO-1242: Json scanner cannot read some cases of truncated text. (jinho) Closes #296
Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/c665ae1f Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/c665ae1f Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/c665ae1f Branch: refs/heads/index_support Commit: c665ae1f6fc1e35e6a743e7e4e377c7885686b32 Parents: 5d9a130 Author: jhkim <[email protected]> Authored: Fri Dec 12 17:00:40 2014 +0900 Committer: jhkim <[email protected]> Committed: Fri Dec 12 17:00:40 2014 +0900 ---------------------------------------------------------------------- CHANGES | 4 +- .../testErrorTolerance3.json | 1 + .../tajo/storage/json/JsonLineDeserializer.java | 39 ++++++++++---------- .../tajo/storage/TestDelimitedTextFile.java | 17 +++++++++ 4 files changed, 41 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index d758459..e41ea56 100644 --- a/CHANGES +++ b/CHANGES @@ -109,7 +109,9 @@ Release 0.9.1 - unreleased BUG FIXES - TAJO-1239 ORDER BY with null column desc miss some data. + TAJO-1242: Json scanner can not read some case of trucated text. (jinho) + + TAJO-1239: ORDER BY with null column desc miss some data. 
(Hyoungjun Kim via hyunsik) TAJO-1244: tajo.worker.tmpdir.locations should use a validator for a list http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json ---------------------------------------------------------------------- diff --git a/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json b/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json new file mode 100644 index 0000000..a7fe424 --- /dev/null +++ b/tajo-storage/src/test/resources/dataset/TestDelimitedTextFile/testErrorTolerance3.json @@ -0,0 +1 @@ +{"id":[{"text":"json test \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java index dfe36f6..a7e02a4 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java @@ -32,7 +32,6 @@ import org.apache.tajo.common.exception.NotImplementedException; import org.apache.tajo.datum.DatumFactory; import org.apache.tajo.datum.NullDatum; import org.apache.tajo.datum.TextDatum; -import org.apache.tajo.datum.protobuf.ProtobufJsonFormat; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.text.TextLineDeserializer; import org.apache.tajo.storage.text.TextLineParsingError; @@ -42,8 +41,8 @@ import java.util.Iterator; public class JsonLineDeserializer extends TextLineDeserializer { private JSONParser parser; - private Type [] types; - 
private String [] columnNames; + private Type[] types; + private String[] columnNames; public JsonLineDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) { super(schema, meta, targetColumnIndexes); @@ -54,27 +53,34 @@ public class JsonLineDeserializer extends TextLineDeserializer { types = SchemaUtil.toTypes(schema); columnNames = SchemaUtil.toSimpleNames(schema); - parser = new JSONParser(JSONParser.MODE_JSON_SIMPLE); + parser = new JSONParser(JSONParser.MODE_JSON_SIMPLE | JSONParser.IGNORE_CONTROL_CHAR); } @Override public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineParsingError { - byte [] line = new byte[buf.readableBytes()]; + byte[] line = new byte[buf.readableBytes()]; buf.readBytes(line); + JSONObject object; try { - JSONObject object = (JSONObject) parser.parse(line); + object = (JSONObject) parser.parse(line); + } catch (ParseException pe) { + throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe); + } catch (ArrayIndexOutOfBoundsException ae) { + // truncated value + throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), ae); + } - for (int i = 0; i < targetColumnIndexes.length; i++) { - int actualIdx = targetColumnIndexes[i]; - String fieldName = columnNames[actualIdx]; + for (int i = 0; i < targetColumnIndexes.length; i++) { + int actualIdx = targetColumnIndexes[i]; + String fieldName = columnNames[actualIdx]; - if (!object.containsKey(fieldName)) { - output.put(actualIdx, NullDatum.get()); - continue; - } + if (!object.containsKey(fieldName)) { + output.put(actualIdx, NullDatum.get()); + continue; + } - switch (types[actualIdx]) { + switch (types[actualIdx]) { case BOOLEAN: String boolStr = object.getAsString(fieldName); if (boolStr != null) { @@ -210,12 +216,7 @@ public class JsonLineDeserializer extends TextLineDeserializer { default: throw new NotImplementedException(types[actualIdx].name() + " is not supported."); - } } - } catch (ParseException pe) { - 
throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe); - } catch (Throwable e) { - throw new IOException(e); } } http://git-wip-us.apache.org/repos/asf/tajo/blob/c665ae1f/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java index 8749925..7e4b7aa 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java @@ -160,4 +160,21 @@ public class TestDelimitedTextFile { } fail(); } + + @Test + public void testIgnoreTruncatedValueErrorTolerance() throws IOException { + TajoConf conf = new TajoConf(); + TableMeta meta = CatalogUtil.newTableMeta(CatalogProtos.StoreType.JSON); + meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "1"); + FileFragment fragment = getFileFragment("testErrorTolerance3.json"); + Scanner scanner = StorageManager.getStorageManager(conf).getScanner(meta, schema, fragment); + scanner.init(); + + try { + Tuple tuple = scanner.next(); + assertNull(tuple); + } finally { + scanner.close(); + } + } }
