Repository: carbondata Updated Branches: refs/heads/master 8c49e5b42 -> d941a5dda
[CARBONDATA-3004][32k] Fix bugs in writing dataframe to carbon with longstring Currently, while writing a dataframe to a carbon table, we need to parse the rows in the dataframe. For string columns, we need to judge whether each one is a long string column or not. In the current implementation, the judgement is position-based, which means that the order of fields in the dataframe should be the same as that in create table. In this PR, we fix this bug. This closes #2812 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/d941a5dd Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/d941a5dd Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/d941a5dd Branch: refs/heads/master Commit: d941a5ddab30f4bfc5e5df04cf31edd3666887dd Parents: 8c49e5b Author: xuchuanyin <[email protected]> Authored: Fri Oct 12 16:20:58 2018 +0800 Committer: Jacky Li <[email protected]> Committed: Tue Oct 23 09:17:02 2018 +0800 ---------------------------------------------------------------------- .../VarcharDataTypesBasicTestCase.scala | 22 ++++++++++++++++++++ .../spark/rdd/NewCarbonDataLoadRDD.scala | 14 ++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/d941a5dd/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala index b607d07..4051de4 100--- 
a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala @@ -405,6 +405,28 @@ class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach wi checkQuery() } + test("write from dataframe with long_string datatype whose order of fields is not the same as that in table") { + sql( + s""" + | CREATE TABLE if not exists $longStringTable( + | id INT, name STRING, description STRING, address STRING, note STRING + | ) STORED BY 'carbondata' + | TBLPROPERTIES('LONG_STRING_COLUMNS'='description, note', 'dictionary_include'='name', 'sort_columns'='id') + |""". + stripMargin) + + prepareDF() + // the order of fields in dataframe is different from that in create table + longStringDF.select("note", "address", "description", "name", "id") + .write + .format("carbondata") + .option("tableName", longStringTable) + .mode(SaveMode.Append) + .save() + + checkQuery() + } + test("desc table shows long_string_columns property") { sql( s""" http://git-wip-us.apache.org/repos/asf/carbondata/blob/d941a5dd/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala index ab8bb8b..fe09034 100644 --- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala +++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/rdd/NewCarbonDataLoadRDD.scala @@ -418,10 +418,18 @@ class LazyRddIterator(serializer: SerializerInstance, private val delimiterLevel2 = carbonLoadModel.getComplexDelimiterLevel2 
private val serializationNullFormat = carbonLoadModel.getSerializationNullFormat.split(CarbonCommonConstants.COMMA, 2)(1) + // the order of fields in dataframe and createTable may be different, here we need to know whether + // each fields in dataframe is Varchar or not. import scala.collection.JavaConverters._ - private val isVarcharTypeMapping = - carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable.getCreateOrderColumn( - carbonLoadModel.getTableName).asScala.map(_.getDataType == DataTypes.VARCHAR) + private val isVarcharTypeMapping = { + val col2VarcharType = carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable + .getCreateOrderColumn(carbonLoadModel.getTableName).asScala + .map(c => c.getColName -> (c.getDataType == DataTypes.VARCHAR)).toMap + carbonLoadModel.getCsvHeaderColumns.map(c => { + val r = col2VarcharType.get(c.toLowerCase) + r.isDefined && r.get + }) + } private var rddIter: Iterator[Row] = null private var uninitialized = true
