Repository: hive
Updated Branches:
  refs/heads/master 35605732b -> 5daad4e44


HIVE-18569 : Hive Druid indexing is not dealing with decimals in the correct way.
(Nishant Bangarwa via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <hashut...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5daad4e4
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5daad4e4
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5daad4e4

Branch: refs/heads/master
Commit: 5daad4e4451e7d181236942e5af85f3cf94c6bad
Parents: 3560573
Author: Ashutosh Chauhan <hashut...@apache.org>
Authored: Mon Jan 29 07:48:00 2018 -0800
Committer: Ashutosh Chauhan <hashut...@apache.org>
Committed: Tue Feb 13 13:14:45 2018 -0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |  3 +
 .../hadoop/hive/druid/io/DruidOutputFormat.java | 12 ++-
 .../test/queries/clientpositive/druidmini_mv.q  |  4 +-
 .../clientpositive/druid/druidmini_mv.q.out     | 98 +++++++++++---------
 4 files changed, 72 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index adb9b9b..ce96bff 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2123,6 +2123,9 @@ public class HiveConf extends Configuration {
             "Wait time in ms default to 30 seconds."
     ),
     HIVE_DRUID_BITMAP_FACTORY_TYPE("hive.druid.bitmap.type", "roaring", new 
PatternSet("roaring", "concise"), "Coding algorithm use to encode the bitmaps"),
+    HIVE_DRUID_APPROX_RESULT("hive.druid.approx.result", false,
+        "Whether to allow approximate results from druid. \n" +
+        "When set to true decimals will be stored as double and druid is 
allowed to return approximate results for decimal columns."),
     // For HBase storage handler
     HIVE_HBASE_WAL_ENABLED("hive.hbase.wal.enabled", true,
         "Whether writes to HBase should be forced to the write-ahead log. \n" +

http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java
----------------------------------------------------------------------
diff --git 
a/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java 
b/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java
index 0977329..8c25d62 100644
--- 
a/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java
+++ 
b/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java
@@ -129,6 +129,7 @@ public class DruidOutputFormat<K, V> implements 
HiveOutputFormat<K, DruidWritabl
     }
     ArrayList<TypeInfo> columnTypes = 
TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
 
+    final boolean approximationAllowed = HiveConf.getBoolVar(jc, 
HiveConf.ConfVars.HIVE_DRUID_APPROX_RESULT);
     // Default, all columns that are not metrics or timestamp, are treated as 
dimensions
     final List<DimensionSchema> dimensions = new ArrayList<>();
     ImmutableList.Builder<AggregatorFactory> aggregatorFactoryBuilder = 
ImmutableList.builder();
@@ -145,9 +146,18 @@ public class DruidOutputFormat<K, V> implements 
HiveOutputFormat<K, DruidWritabl
           break;
         case FLOAT:
         case DOUBLE:
-        case DECIMAL:
           af = new DoubleSumAggregatorFactory(columnNames.get(i), 
columnNames.get(i));
           break;
+        case DECIMAL:
+          if (approximationAllowed) {
+            af = new DoubleSumAggregatorFactory(columnNames.get(i), 
columnNames.get(i));
+          } else {
+            throw new UnsupportedOperationException(
+                String.format("Druid does not support decimal column type." +
+                        "Either cast column [%s] to double or Enable 
Approximate Result for Druid by setting property [%s] to true",
+                    columnNames.get(i), 
HiveConf.ConfVars.HIVE_DRUID_APPROX_RESULT.varname));
+          }
+          break;
         case TIMESTAMP:
           // Granularity column
           String tColumnName = columnNames.get(i);

http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/ql/src/test/queries/clientpositive/druidmini_mv.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/druidmini_mv.q 
b/ql/src/test/queries/clientpositive/druidmini_mv.q
index e059357..9f8500f 100644
--- a/ql/src/test/queries/clientpositive/druidmini_mv.q
+++ b/ql/src/test/queries/clientpositive/druidmini_mv.q
@@ -18,7 +18,7 @@ CREATE MATERIALIZED VIEW cmv_mat_view ENABLE REWRITE
 STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
 TBLPROPERTIES ("druid.segment.granularity" = "HOUR")
 AS
-SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, b, c
+SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, b, cast(c as double)
 FROM cmv_basetable
 WHERE a = 2;
 
@@ -30,7 +30,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS cmv_mat_view2 ENABLE 
REWRITE
 STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
 TBLPROPERTIES ("druid.segment.granularity" = "HOUR")
 AS
-SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, c
+SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, cast(c as double)
 FROM cmv_basetable
 WHERE a = 3;
 

http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out 
b/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out
index 5a0b885..294b84a 100644
--- a/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out
+++ b/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out
@@ -32,7 +32,7 @@ PREHOOK: query: CREATE MATERIALIZED VIEW cmv_mat_view ENABLE 
REWRITE
 STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
 TBLPROPERTIES ("druid.segment.granularity" = "HOUR")
 AS
-SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, b, c
+SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, b, cast(c as double)
 FROM cmv_basetable
 WHERE a = 2
 PREHOOK: type: CREATE_MATERIALIZED_VIEW
@@ -43,7 +43,7 @@ POSTHOOK: query: CREATE MATERIALIZED VIEW cmv_mat_view ENABLE 
REWRITE
 STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
 TBLPROPERTIES ("druid.segment.granularity" = "HOUR")
 AS
-SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, b, c
+SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, b, cast(c as double)
 FROM cmv_basetable
 WHERE a = 2
 POSTHOOK: type: CREATE_MATERIALIZED_VIEW
@@ -58,8 +58,8 @@ POSTHOOK: query: SELECT a, b, c FROM cmv_mat_view
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@cmv_mat_view
 #### A masked pattern was here ####
-2      bob     3.14
-2      bonnie  172342.20
+2      bob     3.140000104904175
+2      bonnie  172342.203125
 PREHOOK: query: SHOW TBLPROPERTIES cmv_mat_view
 PREHOOK: type: SHOW_TBLPROPERTIES
 POSTHOOK: query: SHOW TBLPROPERTIES cmv_mat_view
@@ -77,7 +77,7 @@ PREHOOK: query: CREATE MATERIALIZED VIEW IF NOT EXISTS 
cmv_mat_view2 ENABLE REWR
 STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
 TBLPROPERTIES ("druid.segment.granularity" = "HOUR")
 AS
-SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, c
+SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, cast(c as double)
 FROM cmv_basetable
 WHERE a = 3
 PREHOOK: type: CREATE_MATERIALIZED_VIEW
@@ -88,7 +88,7 @@ POSTHOOK: query: CREATE MATERIALIZED VIEW IF NOT EXISTS 
cmv_mat_view2 ENABLE REW
 STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
 TBLPROPERTIES ("druid.segment.granularity" = "HOUR")
 AS
-SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, c
+SELECT cast(current_timestamp() as timestamp with local time zone) as 
`__time`, a, cast(c as double)
 FROM cmv_basetable
 WHERE a = 3
 POSTHOOK: type: CREATE_MATERIALIZED_VIEW
@@ -103,7 +103,7 @@ POSTHOOK: query: SELECT a, c FROM cmv_mat_view2
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@cmv_mat_view2
 #### A masked pattern was here ####
-6      988.56
+6      988.5599975585938
 PREHOOK: query: SHOW TBLPROPERTIES cmv_mat_view2
 PREHOOK: type: SHOW_TBLPROPERTIES
 POSTHOOK: query: SHOW TBLPROPERTIES cmv_mat_view2
@@ -128,41 +128,52 @@ FROM cmv_basetable
 WHERE a = 3
 POSTHOOK: type: QUERY
 STAGE DEPENDENCIES:
-  Stage-0 is a root stage
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
 
 STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: cmv_basetable
+            Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
+            Filter Operator
+              predicate: (a = 3) (type: boolean)
+              Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
+              Select Operator
+                expressions: 3 (type: int), c (type: decimal(10,2))
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 9310 Basic stats: 
COMPLETE Column stats: NONE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
   Stage: Stage-0
     Fetch Operator
       limit: -1
       Processor Tree:
-        TableScan
-          alias: cmv_mat_view2
-          properties:
-            druid.query.json 
{"queryType":"select","dataSource":"default.cmv_mat_view2","descending":false,"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"],"dimensions":[],"metrics":["a","c"],"granularity":"all","pagingSpec":{"threshold":16384,"fromNext":true},"context":{"druid.query.fetch":false}}
-            druid.query.type select
-          Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column 
stats: NONE
-          Select Operator
-            expressions: a (type: int), c (type: decimal(10,2))
-            outputColumnNames: _col0, _col1
-            Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column 
stats: NONE
-            ListSink
+        ListSink
 
 PREHOOK: query: SELECT a, c
 FROM cmv_basetable
 WHERE a = 3
 PREHOOK: type: QUERY
 PREHOOK: Input: default@cmv_basetable
-PREHOOK: Input: default@cmv_mat_view2
 #### A masked pattern was here ####
 POSTHOOK: query: SELECT a, c
 FROM cmv_basetable
 WHERE a = 3
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@cmv_basetable
-POSTHOOK: Input: default@cmv_mat_view2
 #### A masked pattern was here ####
-6      988.56
-Warning: Shuffle Join JOIN[6][tables = [cmv_mat_view2, $hdt$_0]] in Stage 
'Stage-1:MAPRED' is a cross product
+3      9.80
+3      978.76
+Warning: Shuffle Join JOIN[8][tables = [$hdt$_0, $hdt$_1]] in Stage 
'Stage-1:MAPRED' is a cross product
 PREHOOK: query: EXPLAIN
 SELECT * FROM (
   (SELECT a, c FROM cmv_basetable WHERE a = 3) table1
@@ -189,7 +200,7 @@ STAGE PLANS:
             alias: cmv_basetable
             Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
             Filter Operator
-              predicate: ((3 = a) and (d = 3)) (type: boolean)
+              predicate: (a = 3) (type: boolean)
               Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
               Select Operator
                 expressions: c (type: decimal(10,2))
@@ -200,15 +211,19 @@ STAGE PLANS:
                   Statistics: Num rows: 1 Data size: 9310 Basic stats: 
COMPLETE Column stats: NONE
                   value expressions: _col0 (type: decimal(10,2))
           TableScan
-            alias: cmv_mat_view2
-            properties:
-              druid.query.json 
{"queryType":"select","dataSource":"default.cmv_mat_view2","descending":false,"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"],"dimensions":[],"metrics":["c"],"granularity":"all","pagingSpec":{"threshold":16384,"fromNext":true},"context":{"druid.query.fetch":false}}
-              druid.query.type select
-            Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column 
stats: NONE
-            Reduce Output Operator
-              sort order: 
-              Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column 
stats: NONE
-              value expressions: c (type: decimal(10,2))
+            alias: cmv_basetable
+            Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
+            Filter Operator
+              predicate: ((3 = a) and (d = 3)) (type: boolean)
+              Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
+              Select Operator
+                expressions: c (type: decimal(10,2))
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE 
Column stats: NONE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 9310 Basic stats: 
COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: decimal(10,2))
       Reduce Operator Tree:
         Join Operator
           condition map:
@@ -216,15 +231,15 @@ STAGE PLANS:
           keys:
             0 
             1 
-          outputColumnNames: _col1, _col5
-          Statistics: Num rows: 2 Data size: 18622 Basic stats: PARTIAL Column 
stats: NONE
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 18621 Basic stats: COMPLETE 
Column stats: NONE
           Select Operator
-            expressions: 3 (type: int), _col1 (type: decimal(10,2)), 3 (type: 
int), _col5 (type: decimal(10,2))
+            expressions: 3 (type: int), _col0 (type: decimal(10,2)), 3 (type: 
int), _col1 (type: decimal(10,2))
             outputColumnNames: _col0, _col1, _col2, _col3
-            Statistics: Num rows: 2 Data size: 18622 Basic stats: PARTIAL 
Column stats: NONE
+            Statistics: Num rows: 1 Data size: 18621 Basic stats: COMPLETE 
Column stats: NONE
             File Output Operator
               compressed: false
-              Statistics: Num rows: 2 Data size: 18622 Basic stats: PARTIAL 
Column stats: NONE
+              Statistics: Num rows: 1 Data size: 18621 Basic stats: COMPLETE 
Column stats: NONE
               table:
                   input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                   output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -236,7 +251,7 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-Warning: Shuffle Join JOIN[6][tables = [cmv_mat_view2, $hdt$_0]] in Stage 
'Stage-1:MAPRED' is a cross product
+Warning: Shuffle Join JOIN[8][tables = [$hdt$_0, $hdt$_1]] in Stage 
'Stage-1:MAPRED' is a cross product
 PREHOOK: query: SELECT * FROM (
   (SELECT a, c FROM cmv_basetable WHERE a = 3) table1
   JOIN
@@ -244,7 +259,6 @@ PREHOOK: query: SELECT * FROM (
   ON table1.a = table2.a)
 PREHOOK: type: QUERY
 PREHOOK: Input: default@cmv_basetable
-PREHOOK: Input: default@cmv_mat_view2
 #### A masked pattern was here ####
 POSTHOOK: query: SELECT * FROM (
   (SELECT a, c FROM cmv_basetable WHERE a = 3) table1
@@ -253,9 +267,9 @@ POSTHOOK: query: SELECT * FROM (
   ON table1.a = table2.a)
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@cmv_basetable
-POSTHOOK: Input: default@cmv_mat_view2
 #### A masked pattern was here ####
-3      988.56  3       978.76
+3      9.80    3       978.76
+3      978.76  3       978.76
 PREHOOK: query: INSERT INTO cmv_basetable VALUES
  (3, 'charlie', 15.8, 1)
 PREHOOK: type: QUERY

Reply via email to