Repository: tajo
Updated Branches:
  refs/heads/master 3224addb3 -> e5a01e093


TAJO-1955: Add a feature to strip quotes from CSV file.

Closes #840


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/e5a01e09
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/e5a01e09
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/e5a01e09

Branch: refs/heads/master
Commit: e5a01e0935e7816fb89a3a2c7fc97630e1d226d5
Parents: 3224add
Author: Hyunsik Choi <[email protected]>
Authored: Mon Nov 2 19:03:15 2015 -0800
Committer: Hyunsik Choi <[email protected]>
Committed: Mon Nov 2 19:03:15 2015 -0800

----------------------------------------------------------------------
 CHANGES                                         |  2 +
 .../apache/tajo/storage/StorageConstants.java   |  1 +
 .../src/main/sphinx/table_management/text.rst   |  1 +
 .../tajo/storage/text/CSVLineDeserializer.java  | 18 ++++++-
 .../tajo/storage/TestDelimitedTextFile.java     | 56 +++++++++++++-------
 .../testIncompleteQuote.txt                     |  1 +
 .../TestDelimitedTextFile/testStripQuote.txt    |  6 +++
 7 files changed, 66 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index f3dfd68..426a44a 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,8 @@ Release 0.12.0 - unreleased
 
   NEW FEATURES
 
+    TAJO-1955: Add a feature to strip quotes from CSV file. (hyunsik)
+
   IMPROVEMENT
 
     TAJO-1920: Calling 'Collection.toArray()' with zero-length array argument 

http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
----------------------------------------------------------------------
diff --git 
a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java 
b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
index bb053cc..d7f1ec5 100644
--- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
+++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java
@@ -45,6 +45,7 @@ public class StorageConstants {
   public static final String TEXT_DELIMITER = "text.delimiter";
   public static final String TEXT_NULL = "text.null";
   public static final String TEXT_SERDE_CLASS = "text.serde";
+  public static final String QUOTE_CHAR = "quote_char";
   public static final String DEFAULT_TEXT_SERDE_CLASS = 
"org.apache.tajo.storage.text.CSVLineSerDe";
 
   public static final String TEXT_SKIP_HEADER_LINE = "text.skip.headerlines";

http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-docs/src/main/sphinx/table_management/text.rst
----------------------------------------------------------------------
diff --git a/tajo-docs/src/main/sphinx/table_management/text.rst 
b/tajo-docs/src/main/sphinx/table_management/text.rst
index b79b0e2..0b408a6 100644
--- a/tajo-docs/src/main/sphinx/table_management/text.rst
+++ b/tajo-docs/src/main/sphinx/table_management/text.rst
@@ -42,6 +42,7 @@ The ``WITH`` clause in the CREATE TABLE statement allows 
users to set those para
 * ``text.serde``: custom (De)serializer class. 
``org.apache.tajo.storage.text.CSVLineSerDe`` is the default (De)serializer 
class.
 * ``timezone``: the time zone that the table uses for writing. When table 
rows are read or written, ```timestamp``` and ```time``` column values are 
adjusted by this timezone if it is set. Time zone can be an abbreviation form 
like 'PST' or 'DST'. Also, it accepts an offset-based form like 'UTC+9' or a 
location-based form like 'Asia/Seoul'.
 * ``text.error-tolerance.max-num``: the maximum number of permissible parsing 
errors. This value should be an integer value. By default, 
``text.error-tolerance.max-num`` is ``0``. According to the value, parsing 
errors will be handled in different ways.
+* ``quote_char``:  the quote character. If this property is specified, quote 
characters enclosing a field value will be stripped. If the quote is incomplete, the 
quote character will remain in the value. This may cause invalid parsing, 
probably leading to NULL values for some data types like INT and FLOAT.
 
   * If ``text.error-tolerance.max-num < 0``, all parsing errors are ignored.
   * If ``text.error-tolerance.max-num == 0``, no parsing error is 
allowed. If any error occurs, the query will fail. (default)

http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
----------------------------------------------------------------------
diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
index fdc8645..8664f5e 100644
--- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
+++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java
@@ -27,6 +27,7 @@ import org.apache.tajo.datum.Datum;
 import org.apache.tajo.datum.NullDatum;
 import org.apache.tajo.plan.util.PlannerUtil;
 import org.apache.tajo.storage.FieldSerializerDeserializer;
+import org.apache.tajo.storage.StorageConstants;
 import org.apache.tajo.storage.Tuple;
 
 import java.io.IOException;
@@ -35,6 +36,8 @@ public class CSVLineDeserializer extends TextLineDeserializer 
{
   private ByteBufProcessor processor;
   private FieldSerializerDeserializer fieldSerDer;
   private ByteBuf nullChars;
+  private final boolean hasQuoteChar;
+  private final byte quoteChar;
   private int delimiterCompensation;
 
   private int [] targetColumnIndexes;
@@ -42,6 +45,10 @@ public class CSVLineDeserializer extends 
TextLineDeserializer {
   public CSVLineDeserializer(Schema schema, TableMeta meta, Column [] 
projected) {
     super(schema, meta);
     targetColumnIndexes = PlannerUtil.getTargetIds(schema, projected);
+
+    // The quote char must be a single ASCII character.
+    hasQuoteChar = meta.containsOption(StorageConstants.QUOTE_CHAR);
+    quoteChar = meta.getOption(StorageConstants.QUOTE_CHAR, 
"\0").getBytes()[0];
   }
 
   @Override
@@ -86,7 +93,16 @@ public class CSVLineDeserializer extends 
TextLineDeserializer {
       }
 
       if (projection.length > currentTarget && currentIndex == 
projection[currentTarget]) {
-        lineBuf.setIndex(start, start + fieldLength);
+        final int terminalOffset = start + fieldLength;
+        lineBuf.setIndex(start, terminalOffset);
+
+        // See the issue TAJO-1955. This routine strips quote if the property 
'quote_char' is specified
+        if (hasQuoteChar) {
+          if (lineBuf.getByte(start) == quoteChar && 
lineBuf.getByte(terminalOffset - 1) == quoteChar) {
+            lineBuf.setIndex(start + 1, terminalOffset - 1);
+          }
+        }
+
         try {
           Datum datum = fieldSerDer.deserialize(currentIndex, lineBuf, 
nullChars);
           output.put(currentTarget, datum);

http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
----------------------------------------------------------------------
diff --git 
a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
 
b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
index 5f8a4d1..951cc91 100644
--- 
a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
+++ 
b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestDelimitedTextFile.java
@@ -80,17 +80,6 @@ public class TestDelimitedTextFile {
     return new Path(resultBaseURL.toString(), suffix);
   }
 
-  public static Path getResultPath(Class clazz, String fileName) {
-    return new Path (getResourcePath("results", clazz.getSimpleName()), 
fileName);
-  }
-
-  public static String getResultText(Class clazz, String fileName) throws 
IOException {
-    FileSystem localFS = FileSystem.getLocal(new Configuration());
-    Path path = getResultPath(clazz, fileName);
-    Preconditions.checkState(localFS.exists(path) && localFS.isFile(path));
-    return FileUtil.readTextFile(new File(path.toUri()));
-  }
-
   private static final FileFragment getFileFragment(String fileName) throws 
IOException {
     TajoConf conf = new TajoConf();
     Path tablePath = new Path(getResourcePath("dataset", 
"TestDelimitedTextFile"), fileName);
@@ -100,9 +89,45 @@ public class TestDelimitedTextFile {
   }
 
   @Test
-  public void testIgnoreAllErrors() throws IOException {
-    TajoConf conf = new TajoConf();
+  public void testStripQuote() throws IOException, CloneNotSupportedException {
+    TableMeta meta = CatalogUtil.newTableMeta("TEXT");
+    meta.putOption(StorageUtil.TEXT_DELIMITER, ",");
+    meta.putOption(StorageUtil.QUOTE_CHAR, "\"");
+    FileFragment fragment =  getFileFragment("testStripQuote.txt");
+    Scanner scanner =  TablespaceManager.getLocalFs().getScanner(meta, schema, 
fragment, null);
+    scanner.init();
+
+    Tuple tuple;
+    int i = 0;
+    while ((tuple = scanner.next()) != null) {
+      assertEquals(baseTuple, tuple);
+      i++;
+    }
+    assertEquals(6, i);
+    scanner.close();
+  }
+
+  @Test
+  public void testIncompleteQuote() throws IOException, 
CloneNotSupportedException {
+    TableMeta meta = CatalogUtil.newTableMeta("TEXT");
+    meta.putOption(StorageUtil.TEXT_DELIMITER, ",");
+    meta.putOption(StorageUtil.QUOTE_CHAR, "\"");
+    FileFragment fragment =  getFileFragment("testIncompleteQuote.txt");
+    Scanner scanner =  TablespaceManager.getLocalFs().getScanner(meta, schema, 
fragment, null);
+    scanner.init();
 
+    Tuple tuple;
+    int i = 0;
+    while ((tuple = scanner.next()) != null) {
+      
assertEquals("(f,hyunsik\",NULL,NULL,NULL,NULL,0.0,\"hyunsik,hyunsik,NULL)", 
tuple.toString());
+      i++;
+    }
+    assertEquals(1, i);
+    scanner.close();
+  }
+
+  @Test
+  public void testIgnoreAllErrors() throws IOException {
     TableMeta meta = CatalogUtil.newTableMeta("JSON");
     meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "-1");
     FileFragment fragment =  getFileFragment("testErrorTolerance1.json");
@@ -121,10 +146,6 @@ public class TestDelimitedTextFile {
 
   @Test
   public void testIgnoreOneErrorTolerance() throws IOException {
-
-
-    TajoConf conf = new TajoConf();
-
     TableMeta meta = CatalogUtil.newTableMeta("JSON");
     meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "1");
     FileFragment fragment =  getFileFragment("testErrorTolerance1.json");
@@ -146,7 +167,6 @@ public class TestDelimitedTextFile {
 
   @Test
   public void testNoErrorTolerance() throws IOException {
-    TajoConf conf = new TajoConf();
     TableMeta meta = CatalogUtil.newTableMeta("JSON");
     meta.putOption(StorageUtil.TEXT_ERROR_TOLERANCE_MAXNUM, "0");
     FileFragment fragment =  getFileFragment("testErrorTolerance2.json");

http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt
----------------------------------------------------------------------
diff --git 
a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt
 
b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt
new file mode 100644
index 0000000..e312b82
--- /dev/null
+++ 
b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testIncompleteQuote.txt
@@ -0,0 +1 @@
+"true,hyunsik","17,59","23,77.9","271.9,"hyunsik,"aHl1bnNpaw==,192.168.0.1"

http://git-wip-us.apache.org/repos/asf/tajo/blob/e5a01e09/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt
----------------------------------------------------------------------
diff --git 
a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt
 
b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt
new file mode 100644
index 0000000..31d0cb8
--- /dev/null
+++ 
b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestDelimitedTextFile/testStripQuote.txt
@@ -0,0 +1,6 @@
+true,"hyunsik",17,"59","23",77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1
+true,hyunsik,"17",59,23,77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1
+true,hyunsik,17,59,23,77.9,271.9,hyunsik,"aHl1bnNpaw==",192.168.0.1
+true,"hyunsik","17",59,23,77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1
+true,hyunsik,17,59,"23",77.9,271.9,hyunsik,aHl1bnNpaw==,192.168.0.1
+true,"hyunsik",17,59,23,"77.9",271.9,hyunsik,aHl1bnNpaw==,192.168.0.1

Reply via email to