Repository: hive Updated Branches: refs/heads/master 35605732b -> 5daad4e44
HIVE-18569 : Hive Druid indexing not dealing with decimals in correct way. (Nishant Bangarwa via Ashutosh Chauhan) Signed-off-by: Ashutosh Chauhan <hashut...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5daad4e4 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5daad4e4 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5daad4e4 Branch: refs/heads/master Commit: 5daad4e4451e7d181236942e5af85f3cf94c6bad Parents: 3560573 Author: Ashutosh Chauhan <hashut...@apache.org> Authored: Mon Jan 29 07:48:00 2018 -0800 Committer: Ashutosh Chauhan <hashut...@apache.org> Committed: Tue Feb 13 13:14:45 2018 -0800 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 3 + .../hadoop/hive/druid/io/DruidOutputFormat.java | 12 ++- .../test/queries/clientpositive/druidmini_mv.q | 4 +- .../clientpositive/druid/druidmini_mv.q.out | 98 +++++++++++--------- 4 files changed, 72 insertions(+), 45 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index adb9b9b..ce96bff 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2123,6 +2123,9 @@ public class HiveConf extends Configuration { "Wait time in ms default to 30 seconds." ), HIVE_DRUID_BITMAP_FACTORY_TYPE("hive.druid.bitmap.type", "roaring", new PatternSet("roaring", "concise"), "Coding algorithm use to encode the bitmaps"), + HIVE_DRUID_APPROX_RESULT("hive.druid.approx.result", false, + "Whether to allow approximate results from druid. \n" + + "When set to true decimals will be stored as double and druid is allowed to return approximate results for decimal columns."), // For HBase storage handler HIVE_HBASE_WAL_ENABLED("hive.hbase.wal.enabled", true, "Whether writes to HBase should be forced to the write-ahead log. \n" + http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java ---------------------------------------------------------------------- diff --git a/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java b/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java index 0977329..8c25d62 100644 --- a/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java +++ b/druid-handler/src/java/org/apache/hadoop/hive/druid/io/DruidOutputFormat.java @@ -129,6 +129,7 @@ public class DruidOutputFormat<K, V> implements HiveOutputFormat<K, DruidWritabl } ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + final boolean approximationAllowed = HiveConf.getBoolVar(jc, HiveConf.ConfVars.HIVE_DRUID_APPROX_RESULT); // Default, all columns that are not metrics or timestamp, are treated as dimensions final List<DimensionSchema> dimensions = new ArrayList<>(); ImmutableList.Builder<AggregatorFactory> aggregatorFactoryBuilder = ImmutableList.builder(); @@ -145,9 +146,18 @@ public class DruidOutputFormat<K, V> implements HiveOutputFormat<K, DruidWritabl break; case FLOAT: case DOUBLE: - case DECIMAL: af = new DoubleSumAggregatorFactory(columnNames.get(i), columnNames.get(i)); break; + case DECIMAL: + if (approximationAllowed) { + af = new DoubleSumAggregatorFactory(columnNames.get(i), columnNames.get(i)); + } else { + throw new UnsupportedOperationException( + String.format("Druid does not support decimal column type." + + "Either cast column [%s] to double or Enable Approximate Result for Druid by setting property [%s] to true", + columnNames.get(i), HiveConf.ConfVars.HIVE_DRUID_APPROX_RESULT.varname)); + } + break; case TIMESTAMP: // Granularity column String tColumnName = columnNames.get(i); http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/ql/src/test/queries/clientpositive/druidmini_mv.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/druidmini_mv.q b/ql/src/test/queries/clientpositive/druidmini_mv.q index e059357..9f8500f 100644 --- a/ql/src/test/queries/clientpositive/druidmini_mv.q +++ b/ql/src/test/queries/clientpositive/druidmini_mv.q @@ -18,7 +18,7 @@ CREATE MATERIALIZED VIEW cmv_mat_view ENABLE REWRITE STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "HOUR") AS -SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, b, c +SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, b, cast(c as double) FROM cmv_basetable WHERE a = 2; @@ -30,7 +30,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS cmv_mat_view2 ENABLE REWRITE STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "HOUR") AS -SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, c +SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, cast(c as double) FROM cmv_basetable WHERE a = 3; http://git-wip-us.apache.org/repos/asf/hive/blob/5daad4e4/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out b/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out index 5a0b885..294b84a 100644 --- a/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out +++ b/ql/src/test/results/clientpositive/druid/druidmini_mv.q.out @@ -32,7 +32,7 @@ PREHOOK: query: CREATE MATERIALIZED VIEW cmv_mat_view ENABLE REWRITE STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "HOUR") AS -SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, b, c +SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, b, cast(c as double) FROM cmv_basetable WHERE a = 2 PREHOOK: type: CREATE_MATERIALIZED_VIEW @@ -43,7 +43,7 @@ POSTHOOK: query: CREATE MATERIALIZED VIEW cmv_mat_view ENABLE REWRITE STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "HOUR") AS -SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, b, c +SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, b, cast(c as double) FROM cmv_basetable WHERE a = 2 POSTHOOK: type: CREATE_MATERIALIZED_VIEW @@ -58,8 +58,8 @@ POSTHOOK: query: SELECT a, b, c FROM cmv_mat_view POSTHOOK: type: QUERY POSTHOOK: Input: default@cmv_mat_view #### A masked pattern was here #### -2 bob 3.14 -2 bonnie 172342.20 +2 bob 3.140000104904175 +2 bonnie 172342.203125 PREHOOK: query: SHOW TBLPROPERTIES cmv_mat_view PREHOOK: type: SHOW_TBLPROPERTIES POSTHOOK: query: SHOW TBLPROPERTIES cmv_mat_view @@ -77,7 +77,7 @@ PREHOOK: query: CREATE MATERIALIZED VIEW IF NOT EXISTS cmv_mat_view2 ENABLE REWR STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "HOUR") AS -SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, c +SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, cast(c as double) FROM cmv_basetable WHERE a = 3 PREHOOK: type: CREATE_MATERIALIZED_VIEW @@ -88,7 +88,7 @@ POSTHOOK: query: CREATE MATERIALIZED VIEW IF NOT EXISTS cmv_mat_view2 ENABLE REW STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' TBLPROPERTIES ("druid.segment.granularity" = "HOUR") AS -SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, c +SELECT cast(current_timestamp() as timestamp with local time zone) as `__time`, a, cast(c as double) FROM cmv_basetable WHERE a = 3 POSTHOOK: type: CREATE_MATERIALIZED_VIEW @@ -103,7 +103,7 @@ POSTHOOK: query: SELECT a, c FROM cmv_mat_view2 POSTHOOK: type: QUERY POSTHOOK: Input: default@cmv_mat_view2 #### A masked pattern was here #### -6 988.56 +6 988.5599975585938 PREHOOK: query: SHOW TBLPROPERTIES cmv_mat_view2 PREHOOK: type: SHOW_TBLPROPERTIES POSTHOOK: query: SHOW TBLPROPERTIES cmv_mat_view2 @@ -128,41 +128,52 @@ FROM cmv_basetable WHERE a = 3 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-0 is a root stage + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: cmv_basetable + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (a = 3) (type: boolean) + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 3 (type: int), c (type: decimal(10,2)) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: - TableScan - alias: cmv_mat_view2 - properties: - druid.query.json {"queryType":"select","dataSource":"default.cmv_mat_view2","descending":false,"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"],"dimensions":[],"metrics":["a","c"],"granularity":"all","pagingSpec":{"threshold":16384,"fromNext":true},"context":{"druid.query.fetch":false}} - druid.query.type select - Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: NONE - Select Operator - expressions: a (type: int), c (type: decimal(10,2)) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: NONE - ListSink + ListSink PREHOOK: query: SELECT a, c FROM cmv_basetable WHERE a = 3 PREHOOK: type: QUERY PREHOOK: Input: default@cmv_basetable -PREHOOK: Input: default@cmv_mat_view2 #### A masked pattern was here #### POSTHOOK: query: SELECT a, c FROM cmv_basetable WHERE a = 3 POSTHOOK: type: QUERY POSTHOOK: Input: default@cmv_basetable -POSTHOOK: Input: default@cmv_mat_view2 #### A masked pattern was here #### -6 988.56 -Warning: Shuffle Join JOIN[6][tables = [cmv_mat_view2, $hdt$_0]] in Stage 'Stage-1:MAPRED' is a cross product +3 9.80 +3 978.76 +Warning: Shuffle Join JOIN[8][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product PREHOOK: query: EXPLAIN SELECT * FROM ( (SELECT a, c FROM cmv_basetable WHERE a = 3) table1 @@ -189,7 +200,7 @@ STAGE PLANS: alias: cmv_basetable Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((3 = a) and (d = 3)) (type: boolean) + predicate: (a = 3) (type: boolean) Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: c (type: decimal(10,2)) @@ -200,15 +211,19 @@ STAGE PLANS: Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: decimal(10,2)) TableScan - alias: cmv_mat_view2 - properties: - druid.query.json {"queryType":"select","dataSource":"default.cmv_mat_view2","descending":false,"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"],"dimensions":[],"metrics":["c"],"granularity":"all","pagingSpec":{"threshold":16384,"fromNext":true},"context":{"druid.query.fetch":false}} - druid.query.type select - Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 2 Data size: 0 Basic stats: PARTIAL Column stats: NONE - value expressions: c (type: decimal(10,2)) + alias: cmv_basetable + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((3 = a) and (d = 3)) (type: boolean) + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: c (type: decimal(10,2)) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 9310 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: decimal(10,2)) Reduce Operator Tree: Join Operator condition map: @@ -216,15 +231,15 @@ STAGE PLANS: keys: 0 1 - outputColumnNames: _col1, _col5 - Statistics: Num rows: 2 Data size: 18622 Basic stats: PARTIAL Column stats: NONE + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 18621 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: 3 (type: int), _col1 (type: decimal(10,2)), 3 (type: int), _col5 (type: decimal(10,2)) + expressions: 3 (type: int), _col0 (type: decimal(10,2)), 3 (type: int), _col1 (type: decimal(10,2)) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 18622 Basic stats: PARTIAL Column stats: NONE + Statistics: Num rows: 1 Data size: 18621 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 18622 Basic stats: PARTIAL Column stats: NONE + Statistics: Num rows: 1 Data size: 18621 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -236,7 +251,7 @@ STAGE PLANS: Processor Tree: ListSink -Warning: Shuffle Join JOIN[6][tables = [cmv_mat_view2, $hdt$_0]] in Stage 'Stage-1:MAPRED' is a cross product +Warning: Shuffle Join JOIN[8][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product PREHOOK: query: SELECT * FROM ( (SELECT a, c FROM cmv_basetable WHERE a = 3) table1 JOIN @@ -244,7 +259,6 @@ PREHOOK: query: SELECT * FROM ( ON table1.a = table2.a) PREHOOK: type: QUERY PREHOOK: Input: default@cmv_basetable -PREHOOK: Input: default@cmv_mat_view2 #### A masked pattern was here #### POSTHOOK: query: SELECT * FROM ( (SELECT a, c FROM cmv_basetable WHERE a = 3) table1 @@ -253,9 +267,9 @@ POSTHOOK: query: SELECT * FROM ( ON table1.a = table2.a) POSTHOOK: type: QUERY POSTHOOK: Input: default@cmv_basetable -POSTHOOK: Input: default@cmv_mat_view2 #### A masked pattern was here #### -3 988.56 3 978.76 +3 9.80 3 978.76 +3 978.76 3 978.76 PREHOOK: query: INSERT INTO cmv_basetable VALUES (3, 'charlie', 15.8, 1) PREHOOK: type: QUERY