This is an automated email from the ASF dual-hosted git repository.

ravipesala pushed a commit to branch branch-1.6
in repository https://gitbox.apache.org/repos/asf/carbondata.git

commit 93425458a93871fd18b1c3c41da396dbb06c02c8
Author: Zhang Zhichao <441586...@qq.com>
AuthorDate: Wed Sep 25 15:58:35 2019 +0800

    [CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' issue 
when load data with 'GLOBAL_SORT' from csv files which include big complex type 
data
    
    Problem:
    When complex type data takes more than 32000 characters to represent in a 
csv file, and data is loaded with 'GLOBAL_SORT' from these csv files, it throws 
a 'String length cannot exceed 32000 characters' exception.
    
    Cause:
    When 'GLOBAL_SORT' is used to load data from csv files, the files are read 
and the data is first stored in StringArrayRow, where every field is typed as a 
string. When 'CarbonScalaUtil.getString' is called in 'NewRddIterator.next', it 
checks the length of every value and throws a 'String length cannot exceed 
32000 characters' exception, even for complex type data that legitimately takes 
more than 32000 characters in the csv files.
    
    Solution:
    In 'FieldConverter.objectToString' (called by 'CarbonScalaUtil.getString'), 
skip the length check when the data type of the field is a complex type.
    
    This closes #3399
---
 .../src/test/resources/complexdata3.csv            | 10 +++++
 .../complexType/TestComplexDataType.scala          | 52 ++++++++++++++++++++++
 .../spark/rdd/NewCarbonDataLoadRDD.scala           |  6 ++-
 .../carbondata/spark/util/CarbonScalaUtil.scala    |  4 +-
 .../streaming/parser/FieldConverter.scala          | 14 +++---
 5 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/integration/spark-common-test/src/test/resources/complexdata3.csv 
b/integration/spark-common-test/src/test/resources/complexdata3.csv
new file mode 100644
index 0000000..63cd44b
--- /dev/null
+++ b/integration/spark-common-test/src/test/resources/complexdata3.csv
@@ -0,0 +1,10 @@
+e01a1773-bd37-40be-a1de-d7e74837a281   (0551)96116063  886     0031    5       
(0551)46819921  853             4       0       1568220618904   50      asp     
fk      2745000 1       0       0       0       0       
-0.19569306\0020.10781755\002-0.06963766\002-0.06576662\002-0.17820272\002-0.01949397\0020.08014756\002-0.05287997\0020.02067086\002-0.11302640\0020.07383678\0020.07296083\0020.11693181\002-0.06988186\0020.05753217\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631518\0020.05918765\0020.07385136\002-0.05143059\002-0.19158234\0020.13839211\002
 [...]
+f72ce5cb-2ea6-423b-8c1f-6dadfd6f52e7   (0551)73382297  853     0031    4       
(0551)73382297  49              9       0       1568275177770   1559    asp     
fk      5821000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\00
 [...]
+e282ecb5-9be8-4a0e-8faf-d10e535ab877   13396633307     49      0031    9       
13918448986     1               7       0       1568260253193   1150    asp     
fk      3884000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.0826
 [...]
+01e36a06-b4fd-4638-862c-2785f9e4331b   13924865616     82      0031    0       
0086(021)60080162       82              6       0       1568293725356   2108    
asp     fk      3152000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002
 [...]
+a451790d-42f8-48e5-88f4-ba21118e63e6   13326037312     81      0031    8       
(0551)17198025  852             2       0       1568294179731   2116    asp     
fk      1127000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0
 [...]
+9d26e280-4e87-4cbe-a850-5965b7c36a4b   13376907227     44      0031    9       
13376907227     82              3       0       1568302365552   2332    asp     
fk      2043000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.082
 [...]
+c2eabec9-b8a7-405b-80d9-e73692d586f4   0086(021)77426829       81      0031    
8       13326037312     44              0       0       1568252700180   945     
asp     fk      943000  1       0       0       0       0       
-0.19569306\0020.10781755\002-0.06963766\002-0.06576662\002-0.17820272\002-0.01949397\0020.08014756\002-0.05287997\0020.02067086\002-0.11302640\0020.07383678\0020.07296083\0020.11693181\002-0.06988186\0020.05753217\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631518\0020.05918765\0020.07385136\002-0.05143059\002-0.19158234\0020.13839211\002-0
 [...]
+04a548aa-a103-4ffd-b72c-81b6cb2ea420   0086(021)77426829       82      0031    
2       13924865616     33              0       0       1568249850352   857     
asp     fk      2450000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-
 [...]
+45c0ded1-c608-4a49-981d-faf720442a59   13378149447     49      0031    8       
13376907227     1               5       0       1568289879606   2004    asp     
fk      3686000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.13839212\002-0.0826
 [...]
+cff43f86-ae81-4bbc-90dd-b7de39bdda1b   0086(021)77426829       82      0031    
2       0086(021)60080162       886             6       0       1568230183633   
329     asp     fk      1615000 1       0       0       0       0       
-0.19569308\0020.10781755\002-0.06963766\002-0.06576661\002-0.17820270\002-0.01949396\0020.08014755\002-0.05287996\0020.02067086\002-0.11302640\0020.07383677\0020.07296082\0020.11693182\002-0.06988187\0020.05753216\002-0.02308202\002-0.03685183\0020.05840293\0020.03959572\002-0.01631517\0020.05918765\0020.07385137\002-0.05143059\002-0.19158235\0020.138392
 [...]
diff --git 
a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
 
b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
index 9d6b4d1..8ec1420 100644
--- 
a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
+++ 
b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
@@ -1127,4 +1127,56 @@ class TestComplexDataType extends QueryTest with 
BeforeAndAfterAll {
     sql("drop table if exists hive_table")
   }
 
+  test("[CARBONDATA-3527] Fix 'String length cannot exceed 32000 characters' 
issue when load data with 'GLOBAL_SORT' from csv files which include big 
complex type data") {
+    val tableName = "complexdata3_table"
+    sql(s"drop table if exists ${tableName}")
+    sql(
+      s"""
+         |CREATE TABLE IF NOT EXISTS ${tableName} (
+         | begin_time LONG,
+         | id string,
+         | phone string,
+         | other_phone string,
+         | vtl LONG,
+         | gender string,
+         | lang string,
+         | lang_dec string,
+         | phone_country string,
+         | phone_province string,
+         | phone_city string,
+         | other_phone_country string,
+         | other_phone_province string,
+         | other_phone_city string,
+         | call_type INT,
+         | begin_hhmm INT,
+         | ds string,
+         | voice_flag INT,
+         | dss string,
+         | dur LONG,
+         | modela array < array < FLOAT >>, modelb array < array < FLOAT >>, 
modela_pk array < array < FLOAT >>, modelb_pk array < array < FLOAT >>, 
modela_ms array < array < FLOAT >>, modelb_ms array < array < FLOAT >>, tl LONG,
+         | lang_sc FLOAT,
+         | nlp_sc FLOAT,
+         | create_time LONG,
+         | cdr_create_time LONG,
+         | fulltext string,
+         | tag_label string,
+         | tag_memo string,
+         | tag_listen string,
+         | tag_imp string,
+         | prop string,
+         | files string
+         | )
+         | STORED AS carbondata TBLPROPERTIES (
+         | 'SORT_COLUMNS' = 
'begin_time,id,phone,other_phone,vtl,gender,lang,lang_dec,phone_country,phone_province,phone_city,other_phone_country,other_phone_province,other_phone_city,call_type,begin_hhmm,ds,voice_flag',
+         | 'SORT_SCOPE' = 'GLOBAL_SORT','LONG_STRING_COLUMNS' = 
'fulltext,files')""".stripMargin)
+    sql(s"""LOAD DATA inpath '${resourcesPath}/complexdata3.csv' INTO table 
${tableName}
+        
options('DELIMITER'='\t','QUOTECHAR'='"','COMMENTCHAR'='#','HEADER'='false',
+                
'FILEHEADER'='id,phone,phone_country,phone_province,phone_city,other_phone,other_phone_country,other_phone_province,other_phone_city,call_type,begin_time,begin_hhmm,ds,dss,dur,voice_flag,modela,modelb,modela_pk,modelb_pk,modela_ms,modelb_ms,lang,lang_dec,lang_sc,gender,nlp_sc,tl,vtl,create_time,cdr_create_time,fulltext,tag_label,tag_memo,tag_listen,tag_imp,prop,files',
+                
'MULTILINE'='true','ESCAPECHAR'='\','COMPLEX_DELIMITER_LEVEL_1'='\\001','COMPLEX_DELIMITER_LEVEL_2'='\\002',
+                'SINGLE_PASS'='TRUE')""")
+    checkAnswer(sql(s"select count(1) from ${tableName}"), Seq(Row(10)))
+    checkAnswer(sql(s"select modela[0][0], modela_ms[0][1] from ${tableName} 
where id = 'e01a1773-bd37-40be-a1de-d7e74837a281'"),
+      Seq(Row(0.0, 0.10781755)))
+    sql(s"drop table if exists ${tableName}")
+  }
 }
diff --git 
a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
 
b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
index ac8224e..ce60a55 100644
--- 
a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
+++ 
b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala
@@ -348,6 +348,9 @@ class NewRddIterator(rddIter: Iterator[Row],
   private val isVarcharTypeMapping =
     
carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn(
       carbonLoadModel.getTableName).asScala.map(_.getDataType == 
DataTypes.VARCHAR)
+  private val isComplexTypeMapping =
+    
carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn(
+      carbonLoadModel.getTableName).asScala.map(_.isComplex())
   def hasNext: Boolean = rddIter.hasNext
 
   def next: Array[AnyRef] = {
@@ -356,7 +359,8 @@ class NewRddIterator(rddIter: Iterator[Row],
     for (i <- 0 until columns.length) {
       columns(i) = CarbonScalaUtil.getString(row.get(i), 
serializationNullFormat,
         complexDelimiters, timeStampFormat, dateFormat,
-        isVarcharType = i < isVarcharTypeMapping.size && 
isVarcharTypeMapping(i))
+        isVarcharType = i < isVarcharTypeMapping.size && 
isVarcharTypeMapping(i),
+        isComplexType = i < isComplexTypeMapping.size && 
isComplexTypeMapping(i))
     }
     columns
   }
diff --git 
a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
 
b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
index b0af2ea..d94c5d7 100644
--- 
a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
+++ 
b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala
@@ -66,9 +66,11 @@ object CarbonScalaUtil {
       timeStampFormat: SimpleDateFormat,
       dateFormat: SimpleDateFormat,
       isVarcharType: Boolean = false,
+      isComplexType: Boolean = false,
       level: Int = 0): String = {
     FieldConverter.objectToString(value, serializationNullFormat, 
complexDelimiters,
-      timeStampFormat, dateFormat, isVarcharType = isVarcharType, level)
+      timeStampFormat, dateFormat, isVarcharType = isVarcharType, 
isComplexType = isComplexType,
+      level)
   }
 
   /**
diff --git 
a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
 
b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
index 5c67dfb..0cf244a 100644
--- 
a/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
+++ 
b/streaming/src/main/scala/org/apache/carbondata/streaming/parser/FieldConverter.scala
@@ -42,12 +42,13 @@ object FieldConverter {
       timeStampFormat: SimpleDateFormat,
       dateFormat: SimpleDateFormat,
       isVarcharType: Boolean = false,
+      isComplexType: Boolean = false,
       level: Int = 0): String = {
     if (value == null) {
       serializationNullFormat
     } else {
       value match {
-        case s: String => if (!isVarcharType &&
+        case s: String => if (!isVarcharType && !isComplexType &&
                               s.length > 
CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT) {
           throw new Exception("Dataload failed, String length cannot exceed " +
                               
CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT + " characters")
@@ -68,23 +69,25 @@ object FieldConverter {
           val delimiter = complexDelimiters.get(level)
           val builder = new StringBuilder()
           s.foreach { x =>
+            val nextLevel = level + 1
             builder.append(objectToString(x, serializationNullFormat, 
complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 1))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(delimiter)
           }
           builder.substring(0, builder.length - delimiter.length())
         // First convert the 'key' of Map and then append the 
keyValueDelimiter and then convert
         // the 'value of the map and append delimiter
         case m: scala.collection.Map[_, _] =>
+          val nextLevel = level + 2
           val delimiter = complexDelimiters.get(level)
           val keyValueDelimiter = complexDelimiters.get(level + 1)
           val builder = new StringBuilder()
           m.foreach { x =>
             builder.append(objectToString(x._1, serializationNullFormat, 
complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 2))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(keyValueDelimiter)
             builder.append(objectToString(x._2, serializationNullFormat, 
complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 2))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(delimiter)
           }
           builder.substring(0, builder.length - delimiter.length())
@@ -92,8 +95,9 @@ object FieldConverter {
           val delimiter = complexDelimiters.get(level)
           val builder = new StringBuilder()
           for (i <- 0 until r.length) {
+            val nextLevel = level + 1
             builder.append(objectToString(r(i), serializationNullFormat, 
complexDelimiters,
-              timeStampFormat, dateFormat, isVarcharType, level + 1))
+              timeStampFormat, dateFormat, isVarcharType, level = nextLevel))
               .append(delimiter)
           }
           builder.substring(0, builder.length - delimiter.length())

Reply via email to