Repository: incubator-carbondata Updated Branches: refs/heads/master 8d0a672b9 -> e3fd98bbf
When the data contains the long max and min values for a measure column with bigInt datatype, the delta compression selected is DATA_BYTE, which is incorrect. To select the delta compression, the min value is subtracted from the max value; here the min value is negative, so the subtraction effectively performs an addition and the result goes out of the long range. When a long value goes out of range it wraps around to the most negative long value, which results in the wrong compression being selected. This leads to data loss and incorrect query results. Project: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/commit/4f915d10 Tree: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/tree/4f915d10 Diff: http://git-wip-us.apache.org/repos/asf/incubator-carbondata/diff/4f915d10 Branch: refs/heads/master Commit: 4f915d102a5f6186aed5a74cfe23a0a81ea62aee Parents: 8d0a672 Author: manishgupta88 <tomanishgupt...@gmail.com> Authored: Mon Mar 20 17:13:02 2017 +0530 Committer: manishgupta88 <tomanishgupt...@gmail.com> Committed: Mon Mar 20 17:13:21 2017 +0530 ---------------------------------------------------------------------- .../carbondata/core/util/ValueCompressionUtil.java | 12 ++++++++++-- .../test/resources/testBigInt_boundary_value.csv | 16 ++++++++++++++++ .../spark/testsuite/bigdecimal/TestBigInt.scala | 13 +++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java index 6a14373..c8a9397 100644 --- 
a/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java +++ b/core/src/main/java/org/apache/carbondata/core/util/ValueCompressionUtil.java @@ -181,8 +181,16 @@ public final class ValueCompressionUtil { int mantissa, byte dataTypeSelected, char measureStoreType) { DataType adaptiveDataType = getDataType((long) maxValue, mantissa, dataTypeSelected); int adaptiveSize = getSize(adaptiveDataType); - DataType deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa, - dataTypeSelected); + DataType deltaDataType = null; + // we cannot apply compression in case actual data type of the column is long + // consider the scenario when max and min value are equal to the long max and min values OR + // when the max and min value are resulting in a value greater than long max value, then + // it is not possible to determine the compression type. + if (adaptiveDataType == DataType.DATA_LONG) { + deltaDataType = DataType.DATA_BIGINT; + } else { + deltaDataType = getDataType((long) maxValue - (long) minValue, mantissa, dataTypeSelected); + } int deltaSize = getSize(deltaDataType); if (adaptiveSize > deltaSize) { return new CompressionFinder(COMPRESSION_TYPE.DELTA_DOUBLE, DataType.DATA_BIGINT, http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv new file mode 100644 index 0000000..80ec0b4 --- /dev/null +++ b/integration/spark-common-test/src/test/resources/testBigInt_boundary_value.csv @@ -0,0 +1,16 @@ +Invalid values,92233720368547758071234 +Invalid values,def +All_null_values,null +All_zeros_values,0 +Max_range_values,9223372036854775807 +Max_range_values,9223372036854775807 
+Max_range_values,9223372036854775807 +Max_range_values,9223372036854775807 +Min_range_values,-9223372036854775808 +Min_range_values,-9223372036854775807 +Min_range_values,-9223372036854775806 +Min_range_values,-9223372036854775805 +Normal_values,2345 +Normal_values,1234 +Normal_values,3456 +Normal_values,4567 http://git-wip-us.apache.org/repos/asf/incubator-carbondata/blob/4f915d10/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala index 59fc9d3..f738040 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/bigdecimal/TestBigInt.scala @@ -79,6 +79,19 @@ class TestBigInt extends QueryTest with BeforeAndAfterAll { sql("select count(distinct salary) from hiveTable")) } + test("test big int data type storage for boundary values") { + sql("DROP TABLE IF EXISTS Test_Boundary_carbon") + sql("DROP TABLE IF EXISTS Test_Boundary_hive") + sql("create table Test_Boundary_carbon (c1_String string, c2_Bigint Bigint) STORED BY 'org.apache.carbondata.format'") + sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table Test_Boundary_carbon OPTIONS('FILEHEADER'='c1_String,c2_Bigint')") + sql("create table Test_Boundary_hive (c1_String string, c2_Bigint Bigint) row format delimited fields terminated by ','") + sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/testBigInt_boundary_value.csv' into table Test_Boundary_hive") + checkAnswer(sql("select c2_bigInt*23 from 
Test_Boundary_carbon where c2_BigInt<9223372036854775807"), + sql("select c2_bigInt*23 from Test_Boundary_hive where c2_BigInt<9223372036854775807")) + sql("DROP TABLE IF EXISTS Test_Boundary_carbon") + sql("DROP TABLE IF EXISTS Test_Boundary_hive") + } + override def afterAll { sql("drop table if exists carbonTable") sql("drop table if exists hiveTable")