Repository: carbondata Updated Branches: refs/heads/master 2b7c8b374 -> f354e0b2b
[CARBONDATA-2926] fixed ArrayIndexOutOfBoundException with varchar columns and empty sort_columns problem: ArrayIndexOutOfBoundException if varchar column is present before dictionary columns along with empty sort_columns. root cause: CarbonFactDataHandlerColumnar.isVarcharColumnFull() method uses model.getVarcharDimIdxInNoDict() and index of varchar column in no dictonary array became negative. currently index was calculated based on ordinal-number of dictionary columns. This can go negative in no_sort column case, solution: take the varchar dimension index from no dictionary array from at runtime based on schema. Also hotfix: removed number validation in table properties of sdk, now we support 7 properties This closes #2705 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/f354e0b2 Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/f354e0b2 Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/f354e0b2 Branch: refs/heads/master Commit: f354e0b2b5704a3846b622f9256dc898022b9177 Parents: 2b7c8b3 Author: ajantha-bhat <[email protected]> Authored: Mon Sep 10 23:08:15 2018 +0530 Committer: ravipesala <[email protected]> Committed: Fri Sep 14 19:38:03 2018 +0530 ---------------------------------------------------------------------- .../carbondata/core/datastore/TableSpec.java | 17 +++++++++++ .../schema/table/TableSchemaBuilder.java | 9 +++++- .../sdv/generated/SDKwriterTestCase.scala | 31 ++++++++++++++++++++ .../store/CarbonFactDataHandlerModel.java | 10 ++++--- .../carbondata/processing/store/TablePage.java | 4 ++- .../sdk/file/CarbonWriterBuilder.java | 6 ---- 6 files changed, 65 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/f354e0b2/core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java b/core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java index 4d30cb0..bded430 100644 --- a/core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java +++ b/core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java @@ -20,6 +20,7 @@ package org.apache.carbondata.core.datastore; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.apache.carbondata.core.metadata.datatype.DataType; @@ -36,6 +37,14 @@ public class TableSpec { private DimensionSpec[] dimensionSpec; private MeasureSpec[] measureSpec; + // Many places we might have to access no-dictionary column spec. + // but no-dictionary column spec are not always in below order like, + // dictionary + no dictionary + complex + measure + // when sort_columns are empty, no columns are selected for sorting. + // so, spec will not be in above order. + // Hence noDictionaryDimensionSpec will be useful and it will be subset of dimensionSpec. + private List<DimensionSpec> noDictionaryDimensionSpec; + // number of simple dimensions private int numSimpleDimensions; @@ -56,6 +65,7 @@ public class TableSpec { } dimensionSpec = new DimensionSpec[dimensions.size()]; measureSpec = new MeasureSpec[measures.size()]; + noDictionaryDimensionSpec = new ArrayList<>(); addDimensions(dimensions); addMeasures(measures); } @@ -67,10 +77,12 @@ public class TableSpec { if (dimension.isComplex()) { DimensionSpec spec = new DimensionSpec(ColumnType.COMPLEX, dimension); dimensionSpec[dimIndex++] = spec; + noDictionaryDimensionSpec.add(spec); } else if (dimension.getDataType() == DataTypes.TIMESTAMP && !dimension .isDirectDictionaryEncoding()) { DimensionSpec spec = new DimensionSpec(ColumnType.PLAIN_VALUE, dimension); dimensionSpec[dimIndex++] = spec; + noDictionaryDimensionSpec.add(spec); } else if (dimension.isDirectDictionaryEncoding()) { DimensionSpec spec = new DimensionSpec(ColumnType.DIRECT_DICTIONARY, dimension); dimensionSpec[dimIndex++] = spec; @@ -80,6 +92,7 @@ public class TableSpec { } else { DimensionSpec spec = new DimensionSpec(ColumnType.PLAIN_VALUE, dimension); dimensionSpec[dimIndex++] = spec; + noDictionaryDimensionSpec.add(spec); } } } @@ -95,6 +108,10 @@ public class TableSpec { return dimensionSpec[dimensionIndex]; } + public List<DimensionSpec> getNoDictionaryDimensionSpec() { + return noDictionaryDimensionSpec; + } + public MeasureSpec getMeasureSpec(int measureIndex) { return measureSpec[measureIndex]; } http://git-wip-us.apache.org/repos/asf/carbondata/blob/f354e0b2/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java b/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java index c8eaa16..f1be5ca 100644 --- a/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java +++ b/core/src/main/java/org/apache/carbondata/core/metadata/schema/table/TableSchemaBuilder.java @@ -50,6 +50,8 @@ public class TableSchemaBuilder { private List<ColumnSchema> dimension = new LinkedList<>(); + private List<ColumnSchema> varCharColumns = new LinkedList<>(); + private List<ColumnSchema> complex = new LinkedList<>(); private List<ColumnSchema> measures = new LinkedList<>(); @@ -107,6 +109,7 @@ public class TableSchemaBuilder { schema.setSchemaEvolution(schemaEvol); List<ColumnSchema> allColumns = new LinkedList<>(sortColumns); allColumns.addAll(dimension); + allColumns.addAll(varCharColumns); allColumns.addAll(complex); allColumns.addAll(measures); schema.setListOfColumns(allColumns); @@ -214,7 +217,11 @@ public class TableSchemaBuilder { || isComplexChild) { complex.add(newColumn); } else { - dimension.add(newColumn); + if (field.getDataType() == DataTypes.VARCHAR) { + varCharColumns.add(newColumn); + } else { + dimension.add(newColumn); + } } } if (newColumn.isDimensionColumn()) { http://git-wip-us.apache.org/repos/asf/carbondata/blob/f354e0b2/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/SDKwriterTestCase.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/SDKwriterTestCase.scala b/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/SDKwriterTestCase.scala index 40a0a62..267a05c 100644 --- a/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/SDKwriterTestCase.scala +++ b/integration/spark-common-cluster-test/src/test/scala/org/apache/carbondata/cluster/sdv/generated/SDKwriterTestCase.scala @@ -756,6 +756,37 @@ class SDKwriterTestCase extends QueryTest with BeforeAndAfterEach { sql(s"CREATE EXTERNAL TABLE sdkTable STORED BY 'carbondata' LOCATION '$writerPath'") checkAnswer(sql("select count(*) from sdkTable"), Seq(Row(5))) } + + test("Test sdk with longstring with empty sort column and some direct dictionary columns") { + // here we specify the longstring column as varchar + val schema = new StringBuilder() + .append("[ \n") + .append(" {\"address\":\"varchar\"},\n") + .append(" {\"date1\":\"date\"},\n") + .append(" {\"date2\":\"date\"},\n") + .append(" {\"age\":\"int\"}\n") + .append("]") + .toString() + val builder = CarbonWriter.builder() + val writer = builder + .outputPath(writerPath) + .sortBy(Array[String]()) + .buildWriterForCSVInput(Schema.parseJson(schema)) + + for (i <- 0 until 5) { + writer + .write(Array[String](RandomStringUtils.randomAlphabetic(40000), + "1999-12-01", + "1998-12-01", + i.toString)) + } + writer.close() + + assert(FileFactory.getCarbonFile(writerPath).exists) + sql("DROP TABLE IF EXISTS sdkTable") + sql(s"CREATE EXTERNAL TABLE sdkTable STORED BY 'carbondata' LOCATION '$writerPath'") + checkAnswer(sql("select count(*) from sdkTable"), Seq(Row(5))) + } } object avroUtil{ http://git-wip-us.apache.org/repos/asf/carbondata/blob/f354e0b2/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerModel.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerModel.java b/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerModel.java index 54dd0aa..1a38de6 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerModel.java +++ b/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerModel.java @@ -224,14 +224,16 @@ public class CarbonFactDataHandlerModel { simpleDimsLen[i] = dimLens[i]; } + int noDictionayDimensionIndex = 0; // for dynamic page size in write step if varchar columns exist List<Integer> varcharDimIdxInNoDict = new ArrayList<>(); for (DataField dataField : configuration.getDataFields()) { CarbonColumn column = dataField.getColumn(); - if (!column.isComplex() && !dataField.hasDictionaryEncoding() && - column.getDataType() == DataTypes.VARCHAR) { - // ordinal is set in CarbonTable.fillDimensionsAndMeasuresForTables() - varcharDimIdxInNoDict.add(column.getOrdinal() - simpleDimsCount); + if (!dataField.hasDictionaryEncoding()) { + if (!column.isComplex() && column.getDataType() == DataTypes.VARCHAR) { + varcharDimIdxInNoDict.add(noDictionayDimensionIndex); + } + noDictionayDimensionIndex++; } } http://git-wip-us.apache.org/repos/asf/carbondata/blob/f354e0b2/processing/src/main/java/org/apache/carbondata/processing/store/TablePage.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/store/TablePage.java b/processing/src/main/java/org/apache/carbondata/processing/store/TablePage.java index 73746d6..a311483 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/store/TablePage.java +++ b/processing/src/main/java/org/apache/carbondata/processing/store/TablePage.java @@ -195,8 +195,10 @@ public class TablePage { if (noDictionaryCount > 0 || complexColumnCount > 0) { TableSpec tableSpec = model.getTableSpec(); byte[][] noDictAndComplex = WriteStepRowUtil.getNoDictAndComplexDimension(row); + List<TableSpec.DimensionSpec> noDictionaryDimensionSpec = + tableSpec.getNoDictionaryDimensionSpec(); for (int i = 0; i < noDictAndComplex.length; i++) { - if (tableSpec.getDimensionSpec(dictDimensionPages.length + i).getSchemaDataType() + if (noDictionaryDimensionSpec.get(i).getSchemaDataType() == DataTypes.VARCHAR) { byte[] valueWithLength = addIntLengthToByteArray(noDictAndComplex[i]); noDictDimensionPages[i].putData(rowId, valueWithLength); http://git-wip-us.apache.org/repos/asf/carbondata/blob/f354e0b2/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java ---------------------------------------------------------------------- diff --git a/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java b/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java index 5f3e7b8..388870f 100644 --- a/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java +++ b/store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java @@ -294,12 +294,6 @@ public class CarbonWriterBuilder { */ public CarbonWriterBuilder withTableProperties(Map<String, String> options) { Objects.requireNonNull(options, "Table properties should not be null"); - //validate the options. - if (options.size() > 5) { - throw new IllegalArgumentException("Supports only 5 options now. " - + "Refer method header or documentation"); - } - Set<String> supportedOptions = new HashSet<>(Arrays .asList("table_blocksize", "table_blocklet_size", "local_dictionary_threshold", "local_dictionary_enable", "sort_columns", "sort_scope", "long_string_columns"));
