Repository: tajo Updated Branches: refs/heads/master 3224addb3 -> e5a01e093
TAJO-1955: Add a feature to strip quotes from CSV file. Closes #840 Project: http://git-wip-us.apache.org/repos/asf/tajo/repo Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/e5a01e09 Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/e5a01e09 Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/e5a01e09 Branch: refs/heads/master Commit: e5a01e0935e7816fb89a3a2c7fc97630e1d226d5 Parents: 3224add Author: Hyunsik Choi <[email protected]> Authored: Mon Nov 2 19:03:15 2015 -0800 Committer: Hyunsik Choi <[email protected]> Committed: Mon Nov 2 19:03:15 2015 -0800 ---------------------------------------------------------------------- CHANGES | 2 + .../apache/tajo/storage/StorageConstants.java | 1 + .../src/main/sphinx/table_management/text.rst | 1 + .../tajo/storage/text/CSVLineDeserializer.java | 18 ++++++- .../tajo/storage/TestDelimitedTextFile.java | 56 +++++++++++++------- .../testIncompleteQuote.txt | 1 + .../TestDelimitedTextFile/testStripQuote.txt | 6 +++ 7 files changed, 66 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index f3dfd68..426a44a 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,8 @@ Release 0.12.0 - unreleased NEW FEATURES + TAJO-1955: Add a feature to strip quotes from CSV file. 
(hyunsik) + IMPROVEMENT TAJO-1920: Calling 'Collection.toArray()' with zero-length array argument http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java ---------------------------------------------------------------------- diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java index bb053cc..d7f1ec5 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java @@ -45,6 +45,7 @@ public class StorageConstants { public static final String TEXT_DELIMITER = "text.delimiter"; public static final String TEXT_NULL = "text.null"; public static final String TEXT_SERDE_CLASS = "text.serde"; + public static final String QUOTE_CHAR = "quote_char"; public static final String DEFAULT_TEXT_SERDE_CLASS = "org.apache.tajo.storage.text.CSVLineSerDe"; public static final String TEXT_SKIP_HEADER_LINE = "text.skip.headerlines"; http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-docs/src/main/sphinx/table_management/text.rst ---------------------------------------------------------------------- diff --git a/tajo-docs/src/main/sphinx/table_management/text.rst b/tajo-docs/src/main/sphinx/table_management/text.rst index b79b0e2..0b408a6 100644 --- a/tajo-docs/src/main/sphinx/table_management/text.rst +++ b/tajo-docs/src/main/sphinx/table_management/text.rst @@ -42,6 +42,7 @@ The ``WITH`` clause in the CREATE TABLE statement allows users to set those para * ``text.serde``: custom (De)serializer class. ``org.apache.tajo.storage.text.CSVLineSerDe`` is the default (De)serializer class. * ``timezone``: the time zone that the table uses for writting. When table rows are read or written, ```timestamp``` and ```time``` column values are adjusted by this timezone if it is set. 
Time zone can be an abbreviation form like 'PST' or 'DST'. Also, it accepts an offset-based form like 'UTC+9' or a location-based form like 'Asia/Seoul'. * ``text.error-tolerance.max-num``: the maximum number of permissible parsing errors. This value should be an integer value. By default, ``text.error-tolerance.max-num`` is ``0``. According to the value, parsing errors will be handled in different ways. +* ``quote_char``: quote character. If this property is specified, the quote characters in field values will be ignored. If the quote is incomplete, the quote character will remain in the value. It may cause invalid parsing, probably leading to NULL value for some data types like INT and FLOAT. * If ``text.error-tolerance.max-num < 0``, all parsing errors are ignored. * If ``text.error-tolerance.max-num == 0``, any parsing error is not allowed. If any error occurs, the query will fail. (default) http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java index fdc8645..8664f5e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java @@ -27,6 +27,7 @@ import org.apache.tajo.datum.Datum; import org.apache.tajo.datum.NullDatum; import org.apache.tajo.plan.util.PlannerUtil; import org.apache.tajo.storage.FieldSerializerDeserializer; +import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.storage.Tuple; import java.io.IOException; @@ -35,6 +36,8 @@ public class CSVLineDeserializer extends TextLineDeserializer { 
private ByteBufProcessor processor; private FieldSerializerDeserializer fieldSerDer; private ByteBuf nullChars; + private final boolean hasQuoteChar; + private final byte quoteChar; private int delimiterCompensation; private int [] targetColumnIndexes; @@ -42,6 +45,10 @@ public class CSVLineDeserializer extends TextLineDeserializer { public CSVLineDeserializer(Schema schema, TableMeta meta, Column [] projected) { super(schema, meta); targetColumnIndexes = PlannerUtil.getTargetIds(schema, projected); + + // The quote char must be a single ASCII character. + hasQuoteChar = meta.containsOption(StorageConstants.QUOTE_CHAR); + quoteChar = meta.getOption(StorageConstants.QUOTE_CHAR, "\0").getBytes()[0]; } @Override @@ -86,7 +93,16 @@ public class CSVLineDeserializer extends TextLineDeserializer { } if (projection.length > currentTarget && currentIndex == projection[currentTarget]) { - lineBuf.setIndex(start, start + fieldLength); + final int terminalOffset = start + fieldLength; + lineBuf.setIndex(start, terminalOffset); + + // See the issue TAJO-1955. 
This routine strips quote if the property 'quote_char' is specified + if (hasQuoteChar) { + if (lineBuf.getByte(start) == quoteChar && lineBuf.getByte(terminalOffset - 1) == quoteChar) { + lineBuf.setIndex(start + 1, terminalOffset - 1); + } + } + try { Datum datum = fieldSerDer.deserialize(currentIndex, lineBuf, nullChars); output.put(currentTarget, datum); http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java index 5f8a4d1..951cc91 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java @@ -80,17 +80,6 @@ public class TestDelimitedTextFile { return new Path(resultBaseURL.toString(), suffix); } - public static Path getResultPath(Class clazz, String fileName) { - return new Path (getResourcePath("results", clazz.getSimpleName()), fileName); - } - - public static String getResultText(Class clazz, String fileName) throws IOException { - FileSystem localFS = FileSystem.getLocal(new Configuration()); - Path path = getResultPath(clazz, fileName); - Preconditions.checkState(localFS.exists(path) && localFS.isFile(path)); - return FileUtil.readTextFile(new File(path.toUri())); - } - private static final FileFragment getFileFragment(String fileName) throws IOException { TajoConf conf = new TajoConf(); Path tablePath = new Path(getResourcePath("dataset", "TestDelimitedTextFile"), fileName); @@ -100,9 +89,45 @@ public class TestDelimitedTextFile { } @Test - public void testIgnoreAllErrors() throws IOException { - TajoConf conf = new TajoConf(); + public 
void testStripQuote() throws IOException, CloneNotSupportedException { + TableMeta meta = CatalogUtil.newTableMeta("TEXT"); + meta.putOption(StorageUtil.TEXT_DELIMITER, ","); + meta.putOption(StorageUtil.QUOTE_CHAR, "\""); + FileFragment fragment = getFileFragment("testStripQuote.txt"); + Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, fragment, null); + scanner.init(); + + Tuple tuple; + int i = 0; + while ((tuple = scanner.next()) != null) { + assertEquals(baseTuple, tuple); + i++; + } + assertEquals(6, i); + scanner.close(); + } + + @Test + public void testIncompleteQuote() throws IOException, CloneNotSupportedException { + TableMeta meta = CatalogUtil.newTableMeta("TEXT"); + meta.putOption(StorageUtil.TEXT_DELIMITER, ","); + meta.putOption(StorageUtil.QUOTE_CHAR, "\""); + FileFragment fragment = getFileFragment("testIncompleteQuote.txt"); + Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, fragment, null); + scanner.init(); + Tuple tuple; + int i = 0; + while ((tuple = scanner.next()) != null) { + assertEquals("(f,hyunsik\",NULL,NULL,NULL,NULL,0.0,\"hyunsik,hyunsik,NULL)", tuple.toString()); + i++; + } + assertEquals(1, i); + scanner.close(); + } + + @Test + public void testIgnoreAllErrors() throws IOException { TableMeta meta = CatalogUtil.newTableMeta("JSON"); meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "-1"); FileFragment fragment = getFileFragment("testErrorTolerance1.json"); @@ -121,10 +146,6 @@ public class TestDelimitedTextFile { @Test public void testIgnoreOneErrorTolerance() throws IOException { - - - TajoConf conf = new TajoConf(); - TableMeta meta = CatalogUtil.newTableMeta("JSON"); meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "1"); FileFragment fragment = getFileFragment("testErrorTolerance1.json"); @@ -146,7 +167,6 @@ public class TestDelimitedTextFile { @Test public void testNoErrorTolerance() throws IOException { - TajoConf conf = new TajoConf(); TableMeta meta = 
CatalogUtil.newTableMeta("JSON"); meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "0"); FileFragment fragment = getFileFragment("testErrorTolerance2.json"); http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt new file mode 100644 index 0000000..e312b82 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt @@ -0,0 +1 @@ +"true,hyunsik","17,59","23,77.9","271.9,"hyunsik,"aHl1bnNpaw==,192.168.0.1" http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt ---------------------------------------------------------------------- diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt new file mode 100644 index 0000000..31d0cb8 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt @@ -0,0 +1,6 @@ +true,"hyunsik",17,"59","23",77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1 +true,hyunsik,"17",59,23,77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1 +true,hyunsik,17,59,23,77.9,271.9,hyunsik,"aHl1bnNpaw==",192.168.0.1 +true,"hyunsik","17",59,23,77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1 +true,hyunsik,17,59,"23",77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1 +true,"hyunsik",17,59,23,"77.9",271.9,hyunsik,aHl1bnNpaw==,192.168.0.1
