This is an automated email from the ASF dual-hosted git repository. jackylk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/carbondata.git
commit 0a93c136575a5414e8da202cc7390feda44d8785 Author: ajantha-bhat <ajanthab...@gmail.com> AuthorDate: Mon Dec 9 21:06:14 2019 +0800 [CARBONDATA-3614] Support Alter table properties set/unset for longstring columns If users want to change string columns to long string columns because the data is huge, they can use this command: ALTER TABLE table_name SET TBLPROPERTIES('long_String_columns'='col1,col2') For alter table tableProperties of long string columns, the column schema is modified as below. set operation: a. reset the current varchar datatype to string b. reorder back to original order (consider sort column and original schema ordinal) c. set the new columns to varchar d. reorder new varchar columns to be after dimensions unset operation: a. reset the current varchar datatype to string b. reorder back to original order (consider sort column and original schema ordinal) This closes #3504 --- .../apache/carbondata/core/util/CarbonUtil.java | 99 ++++++++++++++++++++++ .../longstring/VarcharDataTypesBasicTestCase.scala | 77 ++++++++++++++++- .../org/apache/spark/util/AlterTableUtil.scala | 74 +++++++++++++++- 3 files changed, 247 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java b/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java index d3fa33b..8585f78 100644 --- a/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java +++ b/core/src/main/java/org/apache/carbondata/core/util/CarbonUtil.java @@ -36,7 +36,9 @@ import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; @@ -3342,4 +3344,101 @@ public final class CarbonUtil { return file; } } + + /** + * For alter table tableProperties of long string, modify ColumnSchema + * set operation: + * a. 
reset the current varchar datatype to string + * b. reorder back to original order (consider sort column and original schema ordinal) + * c. set the new columns to varchar + * d. reorder new varchar columns to be after dimensions + * <p> + * unset operation: + * a. reset the current varchar datatype to string + * b. reorder back to original order (consider sort column and original schema ordinal) + * + * @param columns list of column schema from thrift + * @param longStringColumnsString comma separated long string columns to be set, empty for unset + * @return updated columnSchema list + */ + public static List<org.apache.carbondata.format.ColumnSchema> reorderColumnsForLongString( + List<org.apache.carbondata.format.ColumnSchema> columns, String longStringColumnsString) { + // schema will be like + // sortColumns-otherDimensions-varchar-[complexColumns + complex_child - measures] + List<org.apache.carbondata.format.ColumnSchema> sortColumns = new ArrayList<>(); + List<org.apache.carbondata.format.ColumnSchema> dimColumns = new ArrayList<>(); + List<org.apache.carbondata.format.ColumnSchema> otherColumns = new ArrayList<>(); + // true if cleared the varchar type in dims to string + boolean isResetDone = false; + // complex type child also looks like dimension or measure, + // so just take all other columns at once. 
+ int otherColumnStartIndex = -1; + for (int i = 0; i < columns.size(); i++) { + if (columns.get(i).getColumnProperties() != null) { + String isSortColumn = + columns.get(i).getColumnProperties().get(CarbonCommonConstants.SORT_COLUMNS); + if ((isSortColumn != null) && (isSortColumn.equalsIgnoreCase("true"))) { + // add sort column dimensions + sortColumns.add(columns.get(i)); + } + } else if ((columns.get(i).getData_type() == org.apache.carbondata.format.DataType.ARRAY + || columns.get(i).getData_type() == org.apache.carbondata.format.DataType.STRUCT + || columns.get(i).getData_type() == org.apache.carbondata.format.DataType.MAP || (!columns + .get(i).isDimension()))) { + // complex type or measure starts here and processed all sort columns and other dimensions + otherColumnStartIndex = i; + break; + } else { + // if it is varchar, reset to string as new set and unset needs to be done. + // if not, just add it to dimColumns + org.apache.carbondata.format.ColumnSchema col = columns.get(i); + if (col.data_type == org.apache.carbondata.format.DataType.VARCHAR) { + col.data_type = org.apache.carbondata.format.DataType.STRING; + isResetDone = true; + } + dimColumns.add(col); + } + } + if (otherColumnStartIndex != -1) { + otherColumns.addAll(columns.subList(otherColumnStartIndex, columns.size())); + } + if (isResetDone) { + // reorder the dims based on original schema ordinal + dimColumns.sort(new Comparator<org.apache.carbondata.format.ColumnSchema>() { + @Override + public int compare(org.apache.carbondata.format.ColumnSchema o1, + org.apache.carbondata.format.ColumnSchema o2) { + return o1.getSchemaOrdinal() - o2.getSchemaOrdinal(); + } + }); + } + List<org.apache.carbondata.format.ColumnSchema> nonVarCharDims = new ArrayList<>(); + // for setting long string columns + if (!longStringColumnsString.isEmpty()) { + String[] inputColumns = longStringColumnsString.split(","); + Set<String> longStringSet = new HashSet<>(Arrays.asList(inputColumns)); + 
List<org.apache.carbondata.format.ColumnSchema> varCharColumns = new ArrayList<>(); + // change data type to varchar and extract the varchar columns + for (org.apache.carbondata.format.ColumnSchema dim : dimColumns) { + if (longStringSet.contains(dim.getColumn_name())) { + dim.data_type = org.apache.carbondata.format.DataType.VARCHAR; + // extract varchar dimensions + varCharColumns.add(dim); + } else { + // extract non varchar, non sort dimensions + nonVarCharDims.add(dim); + } + } + // append varchar in the end of dimensions + nonVarCharDims.addAll(varCharColumns); + } else { + nonVarCharDims = dimColumns; + } + // combine all columns and return + if (otherColumns.size() != 0) { + nonVarCharDims.addAll(otherColumns); + } + sortColumns.addAll(nonVarCharDims); + return sortColumns; + } } diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala index fb8dd5f..428a56a 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/longstring/VarcharDataTypesBasicTestCase.scala @@ -30,7 +30,6 @@ import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.CarbonMetadata import org.apache.carbondata.core.metadata.datatype.DataTypes import org.apache.carbondata.core.util.CarbonProperties - import scala.collection.mutable class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach with BeforeAndAfterAll { @@ -46,6 +45,7 @@ class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach wi private var originMemorySize = CarbonProperties.getInstance().getProperty( CarbonCommonConstants.UNSAFE_WORKING_MEMORY_IN_MB, 
CarbonCommonConstants.UNSAFE_WORKING_MEMORY_IN_MB_DEFAULT) + private var longChar : String = _ case class Content(head: Int, desc_line_head: String, note_line_head: String, mid: Int, desc_line_mid: String, note_line_mid: String, @@ -61,6 +61,7 @@ class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach wi new File(inputDir).mkdir() } content = createFile(inputFile, line = lineNum) + longChar = RandomStringUtils.randomAlphabetic(33000) } override def afterAll(): Unit = { @@ -191,6 +192,80 @@ class VarcharDataTypesBasicTestCase extends QueryTest with BeforeAndAfterEach wi assert(exceptionCaught.getMessage.contains("both in no_inverted_index and long_string_columns")) } + test("test alter table properties for long string columns") { + sql("drop table if exists testlongstring") + sql( + s""" + | CREATE TABLE if not exists testlongstring(id INT, name STRING, description STRING + | ) STORED BY 'carbondata' + |""".stripMargin) + sql("insert into testlongstring select 1, 'ab', 'cool'") + // describe formatted should not have long_string_columns + checkExistence(sql("describe formatted testlongstring"), false, "long_string_columns") + //Alter table add table property of long string columns + sql("ALTER TABLE testlongstring SET TBLPROPERTIES('long_String_columns'='name,description')") + sql(s""" insert into testlongstring select 1, 'ab1', '$longChar'""") + checkAnswer(sql("select * from testlongstring"), Seq(Row(1, "ab", "cool"), Row(1, "ab1", longChar))) + // describe formatted should have long_string_columns + checkExistence(sql("describe formatted testlongstring"), true, "LONG_STRING_COLUMNS") + //insert without without local dictionary + sql("ALTER TABLE testlongstring SET TBLPROPERTIES('local_dictionary_enable'='false')") + sql(s""" insert into testlongstring select 1, 'abc', '$longChar'""") + checkAnswer(sql("select * from testlongstring"), + Seq(Row(1, "ab", "cool"), Row(1, "ab1", longChar), Row(1, "abc", longChar))) + // Unset the 
long_String_columns + sql("ALTER TABLE testlongstring UNSET TBLPROPERTIES('long_string_columns')") + // describe formatted should not have long_string_columns + checkExistence(sql("describe formatted testlongstring"), false, "long_string_columns") + // query should pass + checkAnswer(sql("select * from testlongstring"), + Seq(Row(1, "ab", "cool"), Row(1, "ab1", longChar), Row(1, "abc", longChar))) + // insert long string should fail as unset is done + val e = intercept[Exception] { + sql(s""" insert into testlongstring select 1, 'abc', '$longChar'""") + } + assert(e.getMessage.contains("Dataload failed, String length cannot exceed 32000 characters")) + sql("ALTER TABLE testlongstring SET TBLPROPERTIES('long_String_columns'='description')") + sql(s""" insert into testlongstring select 1, 'ab1', '$longChar'""") + sql("drop table if exists testlongstring") + } + + test("test alter table properties for long_string_columns with complex columns") { + sql("DROP TABLE IF EXISTS varchar_complex_table") + sql(""" + | CREATE TABLE varchar_complex_table + | (m1 int,arr1 array<string>,varchar1 string,s1 string,varchar2 string,arr2 array<string>) + | STORED BY 'carbondata' TBLPROPERTIES('sort_columns'='s1,m1') + | """.stripMargin) + sql( + s"""insert into varchar_complex_table values(1, array('ar1.0','ar1.1'), + |'xx', 'normal string1', 'xx', array('ar2.0','ar2.1'))""".stripMargin) + + sql("ALTER TABLE varchar_complex_table SET TBLPROPERTIES('long_String_columns'='varchar1,varchar2')") + + sql( + s"""insert into varchar_complex_table values(1, array('ar1.0','ar1.1'), '$longChar', + |'normal string1', '$longChar', array('ar2.0','ar2.1')),(2, array('ar1.2','ar1.3'), + |'$longChar', 'normal string2', '$longChar', array('ar2.2','ar2.3'))""".stripMargin) + + checkAnswer( + sql("SELECT * FROM varchar_complex_table where m1=1"), + Seq(Row(1,mutable.WrappedArray.make(Array("ar1.0","ar1.1")),"xx","normal string1", + "xx",mutable.WrappedArray.make(Array("ar2.0","ar2.1"))), + 
Row(1,mutable.WrappedArray.make(Array("ar1.0","ar1.1")),longChar,"normal string1", + longChar,mutable.WrappedArray.make(Array("ar2.0","ar2.1"))))) + checkAnswer( + sql( + """ + |SELECT varchar1,arr2,s1,m1,varchar2,arr1 + |FROM varchar_complex_table + |WHERE arr1[1]='ar1.3' + |""".stripMargin), + Seq(Row(longChar,mutable.WrappedArray.make(Array("ar2.2","ar2.3")),"normal string2",2, + longChar,mutable.WrappedArray.make(Array("ar1.2","ar1.3"))))) + sql("DROP TABLE IF EXISTS varchar_complex_table") + } + test("inverted index columns cannot be present in long_string_cols as they do not support sort_cols") { val exceptionCaught = intercept[MalformedCarbonCommandException] { sql( diff --git a/integration/spark2/src/main/scala/org/apache/spark/util/AlterTableUtil.scala b/integration/spark2/src/main/scala/org/apache/spark/util/AlterTableUtil.scala index 4177152..61eb2dd 100644 --- a/integration/spark2/src/main/scala/org/apache/spark/util/AlterTableUtil.scala +++ b/integration/spark2/src/main/scala/org/apache/spark/util/AlterTableUtil.scala @@ -45,7 +45,7 @@ import org.apache.carbondata.core.metadata.datatype.DataTypes import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema import org.apache.carbondata.core.util.CarbonUtil -import org.apache.carbondata.format.{SchemaEvolutionEntry, TableInfo} +import org.apache.carbondata.format.{DataType, SchemaEvolutionEntry, TableInfo} import org.apache.carbondata.spark.util.{CarbonScalaUtil, CommonUtil} @@ -173,6 +173,20 @@ object AlterTableUtil { } /** + * update schema when LONG_STRING_COLUMNS are changed + */ + private def updateSchemaForLongStringColumns( + thriftTable: TableInfo, + longStringColumns: String) = { + val longStringColumnsString = CarbonUtil.unquoteChar(longStringColumns).trim + val newColumns = CarbonUtil.reorderColumnsForLongString(thriftTable + .getFact_table + .getTable_columns, + longStringColumnsString) + 
thriftTable.getFact_table.setTable_columns(newColumns) + } + + /** * @param carbonTable * @param schemaEvolutionEntry * @param thriftTable @@ -436,6 +450,15 @@ object AlterTableUtil { validateSortScopeAndSortColumnsProperties(carbonTable, lowerCasePropertiesMap) // if SORT_COLUMN is changed, it will move them to the head of column list updateSchemaForSortColumns(thriftTable, lowerCasePropertiesMap, schemaConverter) + // validate long string columns + val longStringColumns = lowerCasePropertiesMap.get("long_string_columns"); + if (longStringColumns.isDefined) { + validateLongStringColumns(longStringColumns.get, carbonTable) + // update schema for long string columns + updateSchemaForLongStringColumns(thriftTable, longStringColumns.get) + } else if (propKeys.exists(_.equalsIgnoreCase("long_string_columns") && !set)) { + updateSchemaForLongStringColumns(thriftTable, "") + } // below map will be used for cache invalidation. As tblProperties map is getting modified // in the next few steps the original map need to be retained for any decision making val existingTablePropertiesMap = mutable.Map(tblPropertiesMap.toSeq: _*) @@ -518,7 +541,8 @@ object AlterTableUtil { "RANGE_COLUMN", "SORT_SCOPE", "SORT_COLUMNS", - "GLOBAL_SORT_PARTITIONS") + "GLOBAL_SORT_PARTITIONS", + "LONG_STRING_COLUMNS") supportedOptions.contains(propKey.toUpperCase) } @@ -813,6 +837,52 @@ object AlterTableUtil { } /** + * Validate LONG_STRING_COLUMNS property specified in Alter command + * + * @param longStringColumns + * @param carbonTable + */ + def validateLongStringColumns(longStringColumns: String, + carbonTable: CarbonTable): Unit = { + // don't allow duplicate column names + val longStringCols = longStringColumns.split(",") + if (longStringCols.distinct.lengthCompare(longStringCols.size) != 0) { + val duplicateColumns = longStringCols + .diff(longStringCols.distinct).distinct + val errMsg = + "LONG_STRING_COLUMNS contains Duplicate Columns: " + + duplicateColumns.mkString(",") + ". 
Please check the DDL." + throw new MalformedCarbonCommandException(errMsg) + } + // check if the column specified exists in table schema and must be of string data type + val colSchemas = carbonTable.getTableInfo.getFactTable.getListOfColumns.asScala + longStringCols.foreach { col => + if (!colSchemas.exists(x => x.getColumnName.equalsIgnoreCase(col.trim))) { + val errorMsg = "LONG_STRING_COLUMNS column: " + col.trim + + " does not exist in table. Please check the DDL." + throw new MalformedCarbonCommandException(errorMsg) + } else if (colSchemas.exists(x => x.getColumnName.equalsIgnoreCase(col.trim) && + !x.getDataType.toString + .equalsIgnoreCase("STRING"))) { + val errMsg = "LONG_STRING_COLUMNS column: " + col.trim + + " is not a string datatype column" + throw new MalformedCarbonCommandException(errMsg) + } + } + // should not be present in sort columns + val sortCols = carbonTable.getSortColumns + if (sortCols != null) { + for (col <- longStringCols) { + if (sortCols.contains(col)) { + val errMsg = + "LONG_STRING_COLUMNS cannot be present in sort columns: " + col + throw new MalformedCarbonCommandException(errMsg) + } + } + } + } + + /** * Validate LOCAL_DICT_COLUMNS property specified in Alter command * @param tblPropertiesMap * @param carbonTable