Repository: incubator-carbondata Updated Branches: refs/heads/master 8d0a672b9 -> e3fd98bbf
When the data contains the long max and min values for a measure column with bigInt datatype, the delta compression selected is DATA_BYTE, which is incorrect. To select the delta compression, the min value is subtracted from the max value; here the min value is negative, so the subtraction effectively performs an addition and the result goes out of the long range. When a long value goes out of range it wraps around to the most negative long value, which results in the wrong compression being selected. This leads to data loss and incorrect query results. Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/4f915d10 Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/4f915d10 Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/4f915d10 Branch: refs/heads/master Commit: 4f915d102a5f6186aed5a74cfe23a0a81ea62aee Parents: 8d0a672 Author: manishgupta88 <tomanishgupt...@gmail.com> Authored: Mon Mar 20 17:13:02 2017 +0530 Committer: manishgupta88 <tomanishgupt...@gmail.com> Committed: Mon Mar 20 17:13:21 2017 +0530 ---------------------------------------------------------------------- .../carbondata/core/util/ValueCompressionUtil.java | 12 ++++++++++-- .../test/resources/testBigInt_boundary_value.csv | 16 ++++++++++++++++ .../spark/testsuite/bigdecimal/TestBigInt.scala | 13 +++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java index 6a14373..c8a9397 100644 --- 
a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java +++ b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java @@ -181,8 +181,16 @@ public final class ValueCompressionUtil { int mantissa, byte dataTypeSelected, char measureStoreType) { DataType adaptiveDataType = getDataType((long) maxValue, mantissa, dataTypeSelected); int adaptiveSize = getSize(adaptiveDataType); - DataType deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa, - dataTypeSelected); + DataType deltaDataType = null; + // we cannot apply compression in case actual data type of the column is long + // consider the scenario when max and min value are equal to the long max and min values OR + // when the max and min value are resulting in a value greater than long max value, then + // it is not possible to determine the compression type. + if (adaptiveDataType == DataType.DATA_LONG) { + deltaDataType = DataType.DATA_BIGINT; + } else { + deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa, dataTypeSelected); + } int deltaSize = getSize(deltaDataType); if (adaptiveSize > deltaSize) { return new CompressionFinder(COMPRESSION_TYPE.DELTA_DOUBLE, DataType.DATA_BIGINT, http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv new file mode 100644 index 0000000..80ec0b4 --- /dev/null +++ b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv @@ -0,0 +1,16 @@ +Invalid values,92233720368547758071234 +Invalid values,def +All_null_values,null +All_zeros_values,0 +Max_range_values,9223372036854775807 +Max_range_values,9223372036854775807 
+Max_range_values,9223372036854775807 +Max_range_values,9223372036854775807 +Min_range_values,-9223372036854775808 +Min_range_values,-9223372036854775807 +Min_range_values,-9223372036854775806 +Min_range_values,-9223372036854775805 +Normal_values,2345 +Normal_values,1234 +Normal_values,3456 +Normal_values,4567 http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala index 59fc9d3..f738040 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala @@ -79,6 +79,19 @@ class TestBigInt extends QueryTest with BeforeAndAfterAll { sql("select count(distinct salary) from hiveTable")) } + test("test big int data type storage for boundary values") { + sql("DROP TABLE IF EXISTS Test_Boundary_carbon") + sql("DROP TABLE IF EXISTS Test_Boundary_hive") + sql("create table Test_Boundary_carbon (c1_String string, c2_Bigint Bigint) STORED BY 'org.apache.carbondata.format'") + sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table Test_Boundary_carbon OPTIONS('FILEHEADER'='c1_String,c2_Bigint')") + sql("create table Test_Boundary_hive (c1_String string, c2_Bigint Bigint) row format delimited fields terminated by ','") + sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table Test_Boundary_hive") + checkAnswer(sql("select c2_bigInt*23 from 
Test_Boundary_carbon where c2_BigInt<9223372036854775807"), + sql("select c2_bigInt*23 from Test_Boundary_hive where c2_BigInt<9223372036854775807")) + sql("DROP TABLE IF EXISTS Test_Boundary_carbon") + sql("DROP TABLE IF EXISTS Test_Boundary_hive") + } + override def afterAll { sql("drop table if exists carbonTable") sql("drop table if exists hiveTable")