Repository: incubator-carbondata
Updated Branches:
  refs/heads/master 4003811b0 -> de56d0e40


Problem: A high MAXCOLUMNS value in the load DML options leads to an out of 
memory error

Analysis: When a very high value — say, Integer.MAX_VALUE — is configured for 
the maxcolumns option in the load DML and the executor memory is low, 
UnivocityCsvParser throws an out of memory error when it tries to create an 
array whose size equals the configured maxColumns value.

Fix: Define a threshold for the maximum maxColumns value the system can 
support; if the configured maxColumns option value exceeds this threshold, 
use the threshold value instead.

Impact: Data loading


Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/15c72428
Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/15c72428
Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/15c72428

Branch: refs/heads/master
Commit: 15c72428ce63b2103f05363aa0390075753fb73b
Parents: 4003811
Author: manishgupta88 <tomanishgupt...@gmail.com>
Authored: Sat Sep 17 10:52:27 2016 +0530
Committer: Venkata Ramana G <ramana.gollam...@huawei.com>
Committed: Sun Sep 18 01:50:36 2016 +0530

----------------------------------------------------------------------
 .../TestDataLoadWithColumnsMoreThanSchema.scala        | 12 ++++++++++++
 .../processing/csvreaderstep/UnivocityCsvParser.java   | 13 +++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/15c72428/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
----------------------------------------------------------------------
diff --git 
a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
 
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
index 7bd29d5..4e5a207 100644
--- 
a/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
+++ 
b/integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestDataLoadWithColumnsMoreThanSchema.scala
@@ -87,6 +87,18 @@ class TestDataLoadWithColumnsMoreThanSchema extends 
QueryTest with BeforeAndAfte
     }
   }
 
+  test("test for maxcolumns option value greater than threshold value for 
maxcolumns") {
+    sql("DROP TABLE IF EXISTS valid_max_columns_test")
+    sql("CREATE TABLE valid_max_columns_test (imei string,age int,task 
bigint,num double,level decimal(10,3),productdate timestamp,mark int,name 
string)STORED BY 'org.apache.carbondata.format'")
+    try {
+      sql("LOAD DATA LOCAL INPATH './src/test/resources/character_carbon.csv' 
into table valid_max_columns_test options('MAXCOLUMNS'='22000')")
+      checkAnswer(sql("select count(*) from valid_max_columns_test"),
+        sql("select count(*) from hive_char_test"))
+    } catch {
+      case _ => assert(false)
+    }
+  }
+
   override def afterAll {
     sql("DROP TABLE IF EXISTS char_test")
     sql("DROP TABLE IF EXISTS hive_char_test")

http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/15c72428/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
----------------------------------------------------------------------
diff --git 
a/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
 
b/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
index 89eec54..f72dd5b 100644
--- 
a/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
+++ 
b/processing/src/main/java/org/apache/carbondata/processing/csvreaderstep/UnivocityCsvParser.java
@@ -49,6 +49,10 @@ public class UnivocityCsvParser {
    */
   private static final int DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 2000;
   /**
+   * Maximum allowed value for number of columns to be parsed in each row
+   */
+  private static final int THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 20000;
+  /**
    * reader for csv
    */
   private Reader inputStreamReader;
@@ -125,12 +129,17 @@ public class UnivocityCsvParser {
     int maxNumberOfColumnsForParsing = 
DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
     if (maxColumns > 0) {
       if (columnCountInSchema > maxColumns) {
-        maxNumberOfColumnsForParsing = columnCountInSchema + 10;
+        maxNumberOfColumnsForParsing = columnCountInSchema;
+      } else if (maxColumns > THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
+        maxNumberOfColumnsForParsing = 
THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING;
+        LOGGER.info("MAXCOLUMNS option value configured is more than system 
allowed limit. "
+            + "Therefore threshold value for max column parsing will be 
considered: "
+            + THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING);
       } else {
         maxNumberOfColumnsForParsing = maxColumns;
       }
     } else if (columnCountInSchema > 
DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING) {
-      maxNumberOfColumnsForParsing = columnCountInSchema + 10;
+      maxNumberOfColumnsForParsing = columnCountInSchema;
     }
     return maxNumberOfColumnsForParsing;
   }

Reply via email to