Repository: incubator-carbondata
Updated Branches:
  refs/heads/master 2c9cd7105 -> 7a6e3517d


Problem: A MAXCOLUMNS option value equal to or less than the column count in the CSV 
header results in an array index out of bounds exception

Analysis: If the column count in the CSV header is greater than or equal to the 
MAXCOLUMNS option value, then an array index out of bounds exception is thrown by 
the Univocity CSV parser. This is because, while parsing a row, the parser adds 
each column value to an array and increments the index; after incrementing, it 
performs one more operation using the incremented index value, which leads to the 
array index out of bounds exception. The relevant CSV parser code snippet is 
attached below.

public void valueParsed() {
        this.parsedValues[column++] = appender.getAndReset();
        this.appender = appenders[column];
}

Fix: Whenever the column count in the CSV header is equal to or more than the 
MAXCOLUMNS option value (or the default value), increment it by 1.

Impact: Data load flow


Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/c90c68cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/c90c68cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/c90c68cc

Branch: refs/heads/master
Commit: c90c68cc7d432a7f106e352fa0869df798ff3c97
Parents: 2c9cd71
Author: manishgupta88 <tomanishgupt...@gmail.com>
Authored: Tue Sep 20 19:51:33 2016 +0530
Committer: Venkata Ramana G <ramana.gollam...@huawei.com>
Committed: Wed Sep 21 00:14:59 2016 +0530

----------------------------------------------------------------------
 .../TestDataLoadWithColumnsMoreThanSchema.scala | 32 ++++++++++++++++++++
 .../csvreaderstep/UnivocityCsvParser.java       | 12 +++++---
 2 files changed, 40 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/c90c68cc/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
----------------------------------------------------------------------
diff --git 
a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
 
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
index 4e5a207..da53143 100644
--- 
a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
+++ 
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
@@ -99,6 +99,38 @@ class TestDataLoadWithColumnsMoreThanSchema extends 
QueryTest with BeforeAndAfte
     }
   }
 
+  test("test for boundary value for maxcolumns") {
+    sql("DROP TABLE IF EXISTS boundary_max_columns_test")
+    sql("CREATE TABLE boundary_max_columns_test (empno string, empname String, 
designation String, doj String, " +
+        "workgroupcategory string, workgroupcategoryname String, deptno 
string, deptname String, " +
+        "projectcode string, projectjoindate String, projectenddate 
String,attendance double," +
+        "utilization double,salary double) STORED BY 
'org.apache.carbondata.format' TBLPROPERTIES" +
+        
"('DICTIONARY_EXCLUDE'='empno,empname,designation,doj,workgroupcategory," +
+        
"workgroupcategoryname,deptno,deptname,projectcode,projectjoindate,projectenddate')")
+    try {
+      sql("LOAD DATA LOCAL INPATH './src/test/resources/data.csv' into table 
boundary_max_columns_test options('MAXCOLUMNS'='14')")
+      assert(true)
+    } catch {
+      case _ => assert(false)
+    }
+  }
+
+  test("test for maxcolumns value less than columns in 1st line of csv file") {
+    sql("DROP TABLE IF EXISTS boundary_max_columns_test")
+    sql("CREATE TABLE boundary_max_columns_test (empno string, empname String, 
designation String, doj String, " +
+        "workgroupcategory string, workgroupcategoryname String, deptno 
string, deptname String, " +
+        "projectcode string, projectjoindate String, projectenddate 
String,attendance double," +
+        "utilization double,salary double) STORED BY 
'org.apache.carbondata.format' TBLPROPERTIES" +
+        
"('DICTIONARY_EXCLUDE'='empno,empname,designation,doj,workgroupcategory," +
+        
"workgroupcategoryname,deptno,deptname,projectcode,projectjoindate,projectenddate')")
+    try {
+      sql("LOAD DATA LOCAL INPATH './src/test/resources/data.csv' into table 
boundary_max_columns_test options('MAXCOLUMNS'='13')")
+      assert(true)
+    } catch {
+      case _ => assert(false)
+    }
+  }
+
   override def afterAll {
     sql("DROP TABLE IF EXISTS char_test")
     sql("DROP TABLE IF EXISTS hive_char_test")

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/c90c68cc/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
----------------------------------------------------------------------
diff --git 
a/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
 
b/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
index f72dd5b..a677a50 100644
--- 
a/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
+++ 
b/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
@@ -128,8 +128,10 @@ public class UnivocityCsvParser {
   private int getMaxColumnsForParsing(int columnCountInSchema, int maxColumns) 
{
     int maxNumberOfColumnsForParsing = 
DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
     if (maxColumns > 0) {
-      if (columnCountInSchema > maxColumns) {
-        maxNumberOfColumnsForParsing = columnCountInSchema;
+      if (columnCountInSchema >= maxColumns) {
+        // univocity parser needs one extra count from the number of columns
+        // specified during processing. eg. columnCount=12, then array size 
should be 13
+        maxNumberOfColumnsForParsing = columnCountInSchema + 1;
       } else if (maxColumns > THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
         maxNumberOfColumnsForParsing = 
THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
         LOGGER.info("MAXCOLUMNS option value configured is more than system 
allowed limit. "
@@ -138,8 +140,10 @@ public class UnivocityCsvParser {
       } else {
         maxNumberOfColumnsForParsing = maxColumns;
       }
-    } else if (columnCountInSchema > 
DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
-      maxNumberOfColumnsForParsing = columnCountInSchema;
+    } else if (columnCountInSchema >= 
DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
+      // univocity parser needs one extra count from the number of columns
+      // specified during processing. eg. columnCount=2200, then array size 
should be 2201
+      maxNumberOfColumnsForParsing = columnCountInSchema + 1;
     }
     return maxNumberOfColumnsForParsing;
   }

Reply via email to