http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out b/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out index 68aabb0..4a10953 100644 --- a/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out +++ b/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out @@ -193,48 +193,48 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) + value expressions: _col0 (type: int) Execution mode: vectorized Map 4 Map Operator Tree: TableScan - alias: c + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: value is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int) - outputColumnNames: _col0 + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int) 
Execution mode: vectorized Map 5 Map Operator Tree: TableScan - alias: b + alias: c Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int), value (type: string) - outputColumnNames: _col0, _col1 + expressions: key (type: int) + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) Execution mode: vectorized Reducer 2 Reduce Operator Tree: @@ -242,28 +242,28 @@ STAGE PLANS: condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: int) - 1 _col0 (type: int) - outputColumnNames: _col0, _col1 + 0 _col1 (type: string) + 1 _col1 (type: string) + outputColumnNames: _col0, _col2 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) + value expressions: _col2 (type: int) Reducer 3 Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col1 (type: string) - 1 _col1 (type: string) - outputColumnNames: _col0, _col3 + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col2 Statistics: Num rows: 605 Data size: 6427 Basic stats: 
COMPLETE Column stats: NONE Select Operator - expressions: _col0 (type: int), _col3 (type: int) + expressions: _col0 (type: int), _col2 (type: int) outputColumnNames: _col0, _col1 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE File Output Operator @@ -309,48 +309,48 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) + value expressions: _col0 (type: int) Execution mode: vectorized Map 4 Map Operator Tree: TableScan - alias: c + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: value is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int) - outputColumnNames: _col0 + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int) Execution mode: vectorized Map 5 Map Operator Tree: TableScan - alias: b + alias: c Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: 
key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int), value (type: string) - outputColumnNames: _col0, _col1 + expressions: key (type: int) + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) Execution mode: vectorized Reducer 2 Reduce Operator Tree: @@ -358,28 +358,28 @@ STAGE PLANS: condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: int) - 1 _col0 (type: int) - outputColumnNames: _col0, _col1 + 0 _col1 (type: string) + 1 _col1 (type: string) + outputColumnNames: _col0, _col2 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) + value expressions: _col2 (type: int) Reducer 3 Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col1 (type: string) - 1 _col1 (type: string) - outputColumnNames: _col0, _col3 + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col2 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: _col0 (type: int), _col3 (type: int) + expressions: _col0 (type: int), _col2 (type: int) outputColumnNames: _col0, _col1 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE 
Column stats: NONE File Output Operator @@ -1906,48 +1906,48 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) + value expressions: _col0 (type: int) Execution mode: vectorized Map 4 Map Operator Tree: TableScan - alias: c + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: value is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int) - outputColumnNames: _col0 + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int) Execution mode: vectorized Map 5 Map Operator Tree: TableScan - alias: b + alias: c Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int), value (type: string) - outputColumnNames: _col0, _col1 + expressions: key 
(type: int) + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) Execution mode: vectorized Reducer 2 Reduce Operator Tree: @@ -1955,28 +1955,28 @@ STAGE PLANS: condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: int) - 1 _col0 (type: int) - outputColumnNames: _col0, _col1 + 0 _col1 (type: string) + 1 _col1 (type: string) + outputColumnNames: _col0, _col2 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) + value expressions: _col2 (type: int) Reducer 3 Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col1 (type: string) - 1 _col1 (type: string) - outputColumnNames: _col0, _col3 + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col2 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: _col0 (type: int), _col3 (type: int) + expressions: _col0 (type: int), _col2 (type: int) outputColumnNames: _col0, _col1 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE File Output Operator @@ -2023,48 +2023,48 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: 
_col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: string) + value expressions: _col0 (type: int) Execution mode: vectorized Map 4 Map Operator Tree: TableScan - alias: c + alias: b Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: value is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int) - outputColumnNames: _col0 + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: int) + key expressions: _col1 (type: string) sort order: + - Map-reduce partition columns: _col0 (type: int) + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int) Execution mode: vectorized Map 5 Map Operator Tree: TableScan - alias: b + alias: c Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: int), value (type: string) - outputColumnNames: _col0, _col1 + expressions: key (type: int) + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - 
Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) Execution mode: vectorized Reducer 2 Reduce Operator Tree: @@ -2072,28 +2072,28 @@ STAGE PLANS: condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: int) - 1 _col0 (type: int) - outputColumnNames: _col0, _col1 + 0 _col1 (type: string) + 1 _col1 (type: string) + outputColumnNames: _col0, _col2 Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: string) + key expressions: _col0 (type: int) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: int) + value expressions: _col2 (type: int) Reducer 3 Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col1 (type: string) - 1 _col1 (type: string) - outputColumnNames: _col0, _col3 + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col2 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: _col0 (type: int), _col3 (type: int) + expressions: _col0 (type: int), _col2 (type: int) outputColumnNames: _col0, _col1 Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE File Output Operator
http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/spark/join32_lessSize.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/join32_lessSize.q.out b/ql/src/test/results/clientpositive/spark/join32_lessSize.q.out index b1363f0..ddd6bd1 100644 --- a/ql/src/test/results/clientpositive/spark/join32_lessSize.q.out +++ b/ql/src/test/results/clientpositive/spark/join32_lessSize.q.out @@ -488,26 +488,25 @@ JOIN src y ON (x.key = y.key) JOIN src1 z ON (x.key = z.key) POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-4 is a root stage - Stage-3 depends on stages: Stage-4 + Stage-3 is a root stage Stage-1 depends on stages: Stage-3 Stage-0 depends on stages: Stage-1 Stage-2 depends on stages: Stage-0 STAGE PLANS: - Stage: Stage-4 + Stage: Stage-3 Spark #### A masked pattern was here #### Vertices: - Map 2 + Map 1 Map Operator Tree: TableScan - alias: z + alias: x Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false - predicate: key is not null (type: boolean) + predicate: (key is not null and value is not null) (type: boolean) Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string) @@ -517,7 +516,8 @@ STAGE PLANS: keys: 0 _col0 (type: string) 1 _col0 (type: string) - Position of Big Table: 0 + 2 _col0 (type: string) + Position of Big Table: 2 Execution mode: vectorized Local Work: Map Reduce Local Work @@ -573,42 +573,27 @@ STAGE PLANS: name: default.src1 name: default.src1 Truncated Path -> Alias: - /src1 [$hdt$_3:z] - - Stage: Stage-3 - Spark -#### A masked pattern was here #### - Vertices: - Map 1 + /src1 [$hdt$_2:x] + Map 2 Map Operator Tree: TableScan - alias: x + alias: z Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE GatherStats: false 
Filter Operator isSamplingPred: false - predicate: (key is not null and value is not null) (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string) outputColumnNames: _col0, _col1 Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE - Map Join Operator - condition map: - Inner Join 0 to 1 + Spark HashTable Sink Operator keys: 0 _col0 (type: string) 1 _col0 (type: string) - outputColumnNames: _col0, _col1, _col3 - input vertices: - 1 Map 2 - Position of Big Table: 0 - Statistics: Num rows: 27 Data size: 210 Basic stats: COMPLETE Column stats: NONE - Spark HashTable Sink Operator - keys: - 0 _col1 (type: string) - 1 _col0 (type: string) - Position of Big Table: 1 + 2 _col0 (type: string) + Position of Big Table: 2 Execution mode: vectorized Local Work: Map Reduce Local Work @@ -664,24 +649,24 @@ STAGE PLANS: name: default.src1 name: default.src1 Truncated Path -> Alias: - /src1 [$hdt$_2:x] + /src1 [$hdt$_3:z] Map 4 Map Operator Tree: TableScan - alias: y + alias: w Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false - predicate: key is not null (type: boolean) + predicate: value is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: string), value (type: string) - outputColumnNames: _col0, _col1 + expressions: value (type: string) + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Spark HashTable Sink Operator keys: - 0 _col0 (type: string) + 0 _col1 (type: string) 1 _col0 (type: string) Position of Big Table: 0 Execution mode: vectorized @@ -739,7 +724,7 @@ STAGE PLANS: name: default.src name: default.src Truncated Path -> Alias: - /src 
[$hdt$_0:y] + /src [$hdt$_0:w] Stage: Stage-1 Spark @@ -748,49 +733,52 @@ STAGE PLANS: Map 3 Map Operator Tree: TableScan - alias: w + alias: y Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false - predicate: value is not null (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: value (type: string) - outputColumnNames: _col0 + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 + Inner Join 0 to 2 keys: - 0 _col1 (type: string) + 0 _col0 (type: string) 1 _col0 (type: string) - outputColumnNames: _col0, _col3 + 2 _col0 (type: string) + outputColumnNames: _col0, _col1, _col3, _col5 input vertices: 0 Map 1 - Position of Big Table: 1 - Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE + 1 Map 2 + Position of Big Table: 2 + Statistics: Num rows: 1100 Data size: 11686 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: string) + 0 _col1 (type: string) 1 _col0 (type: string) - outputColumnNames: _col0, _col3, _col6 + outputColumnNames: _col0, _col3, _col5 input vertices: 1 Map 4 Position of Big Table: 0 - Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: _col0 (type: string), _col3 (type: string), _col6 (type: string) + expressions: _col0 (type: string), _col3 (type: string), _col5 (type: string) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1210 Data 
size: 12854 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false GlobalTableId: 1 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1210 Data size: 12854 Basic stats: COMPLETE Column stats: NONE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -873,7 +861,7 @@ STAGE PLANS: name: default.src name: default.src Truncated Path -> Alias: - /src [$hdt$_1:w] + /src [$hdt$_1:y] Stage: Stage-0 Move Operator http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/spark/mapjoin_mapjoin.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/mapjoin_mapjoin.q.out b/ql/src/test/results/clientpositive/spark/mapjoin_mapjoin.q.out index ec632a6..fff2f31 100644 --- a/ql/src/test/results/clientpositive/spark/mapjoin_mapjoin.q.out +++ b/ql/src/test/results/clientpositive/spark/mapjoin_mapjoin.q.out @@ -566,18 +566,18 @@ STAGE PLANS: Map 5 Map Operator Tree: TableScan - alias: src1 + alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: value is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: key (type: string) + expressions: value (type: string) outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Spark HashTable Sink Operator keys: - 0 _col0 (type: string) + 0 _col1 (type: string) 1 _col0 (type: string) Execution mode: vectorized Local Work: @@ -603,22 +603,22 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - 
key expressions: _col1 (type: string) + key expressions: _col0 (type: string) sort order: + - Map-reduce partition columns: _col1 (type: string) + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: string), _col2 (type: string) + value expressions: _col1 (type: string), _col2 (type: string) Execution mode: vectorized Map 4 Map Operator Tree: TableScan - alias: src + alias: src1 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: key is not null (type: boolean) Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Select Operator - expressions: value (type: string) + expressions: key (type: string) outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator @@ -635,15 +635,15 @@ STAGE PLANS: condition map: Inner Join 0 to 1 keys: - 0 _col1 (type: string) + 0 _col0 (type: string) 1 _col0 (type: string) - outputColumnNames: _col0, _col2 + outputColumnNames: _col1, _col2 Statistics: Num rows: 2200 Data size: 23372 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col0 (type: string) + 0 _col1 (type: string) 1 _col0 (type: string) outputColumnNames: _col2 input vertices: http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/spark/spark_explainuser_1.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/spark_explainuser_1.q.out b/ql/src/test/results/clientpositive/spark/spark_explainuser_1.q.out index 103491d..85d0b8a 100644 --- a/ql/src/test/results/clientpositive/spark/spark_explainuser_1.q.out +++ b/ql/src/test/results/clientpositive/spark/spark_explainuser_1.q.out @@ -1665,9 
+1665,9 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_19] - Select Operator [SEL_18] (rows=366 width=178) + Select Operator [SEL_18] (rows=365 width=178) Output:["_col0","_col1"] - Filter Operator [FIL_17] (rows=366 width=179) + Filter Operator [FIL_17] (rows=365 width=179) predicate:_col3 is null Join Operator [JOIN_16] (rows=500 width=179) Output:["_col0","_col1","_col3"],condition map:[{"":"{\"type\":\"Left Outer\",\"left\":0,\"right\":1}"}],keys:{"0":"_col1","1":"_col0"} @@ -1730,9 +1730,9 @@ Stage-0 Stage-1 Reducer 3 File Output Operator [FS_18] - Select Operator [SEL_17] (rows=183 width=178) + Select Operator [SEL_17] (rows=185 width=178) Output:["_col0","_col1"] - Filter Operator [FIL_16] (rows=183 width=179) + Filter Operator [FIL_16] (rows=185 width=179) predicate:_col4 is null Join Operator [JOIN_15] (rows=250 width=179) Output:["_col0","_col1","_col4"],condition map:[{"":"{\"type\":\"Left Outer\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0, _col1","1":"_col0, _col1"} @@ -1806,7 +1806,7 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_12] - Join Operator [JOIN_10] (rows=133 width=178) + Join Operator [JOIN_10] (rows=131 width=178) Output:["_col0","_col1"],condition map:[{"":"{\"type\":\"Left Semi\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0, _col1","1":"_col0, _col1"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_8] @@ -1858,7 +1858,7 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_12] - Join Operator [JOIN_10] (rows=133 width=178) + Join Operator [JOIN_10] (rows=131 width=178) Output:["_col0","_col1"],condition map:[{"":"{\"type\":\"Left Semi\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0, _col1","1":"_col0, _col1"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_8] @@ -1900,7 +1900,7 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_12] - Join Operator [JOIN_10] (rows=133 width=178) + Join Operator [JOIN_10] (rows=131 width=178) Output:["_col0","_col1"],condition map:[{"":"{\"type\":\"Left 
Semi\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_8] @@ -2021,16 +2021,16 @@ Stage-0 <-Reducer 3 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_27] PartitionCols:_col2 - Filter Operator [FIL_15] (rows=66 width=186) + Filter Operator [FIL_15] (rows=65 width=186) predicate:_col2 is not null - Group By Operator [GBY_14] (rows=66 width=186) + Group By Operator [GBY_14] (rows=65 width=186) Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)"],keys:KEY._col0, KEY._col1 <-Reducer 2 [GROUP] GROUP [RS_13] PartitionCols:_col0, _col1 - Group By Operator [GBY_12] (rows=66 width=186) + Group By Operator [GBY_12] (rows=65 width=186) Output:["_col0","_col1","_col2"],aggregations:["count()"],keys:_col0, _col1 - Join Operator [JOIN_10] (rows=133 width=178) + Join Operator [JOIN_10] (rows=131 width=178) Output:["_col0","_col1"],condition map:[{"":"{\"type\":\"Left Semi\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_8] @@ -2634,7 +2634,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_10] (rows=16 width=94) Output:["_col0","_col1"],aggregations:["count()"],keys:_col0 - Join Operator [JOIN_8] (rows=40 width=86) + Join Operator [JOIN_8] (rows=39 width=86) Output:["_col0"],condition map:[{"":"{\"type\":\"Inner\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_6] @@ -2695,7 +2695,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_10] (rows=16 width=94) Output:["_col0","_col1"],aggregations:["count()"],keys:_col0 - Join Operator [JOIN_8] (rows=40 width=86) + Join Operator [JOIN_8] (rows=39 width=86) Output:["_col0"],condition map:[{"":"{\"type\":\"Inner\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_6] @@ -2755,7 +2755,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_10] (rows=16 
width=94) Output:["_col0","_col1"],aggregations:["count()"],keys:_col0 - Map Join Operator [MAPJOIN_22] (rows=40 width=86) + Map Join Operator [MAPJOIN_22] (rows=39 width=86) Conds:SEL_5._col0=SEL_5._col0(Inner),Output:["_col0"] <-Select Operator [SEL_5] (rows=500 width=87) Output:["_col0"] @@ -2807,16 +2807,16 @@ Stage-0 GROUP [RS_18] Group By Operator [GBY_17] (rows=1 width=16) Output:["_col0","_col1"],aggregations:["sum(_col0)","sum(_col1)"] - Select Operator [SEL_15] (rows=10 width=94) + Select Operator [SEL_15] (rows=9 width=94) Output:["_col0","_col1"] - Group By Operator [GBY_14] (rows=10 width=94) + Group By Operator [GBY_14] (rows=9 width=94) Output:["_col0","_col1"],aggregations:["count(VALUE._col0)"],keys:KEY._col0 <-Reducer 2 [GROUP] GROUP [RS_13] PartitionCols:_col0 - Group By Operator [GBY_12] (rows=10 width=94) + Group By Operator [GBY_12] (rows=9 width=94) Output:["_col0","_col1"],aggregations:["count()"],keys:_col0 - Join Operator [JOIN_10] (rows=20 width=86) + Join Operator [JOIN_10] (rows=19 width=86) Output:["_col0"],condition map:[{"":"{\"type\":\"Left Semi\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_8] @@ -5215,9 +5215,9 @@ Stage-2 Reducer 2 File Output Operator [FS_11] table:{"name:":"default.dest_j1_n14"} - Select Operator [SEL_9] (rows=809 width=95) + Select Operator [SEL_9] (rows=791 width=95) Output:["_col0","_col1"] - Join Operator [JOIN_8] (rows=809 width=178) + Join Operator [JOIN_8] (rows=791 width=178) Output:["_col0","_col2"],condition map:[{"":"{\"type\":\"Inner\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_6] @@ -5241,7 +5241,7 @@ Stage-2 Map 4 File Output Operator [FS_11] table:{"name:":"default.dest_j1_n14"} - Select Operator [SEL_9] (rows=809 width=95) + Select Operator [SEL_9] (rows=791 width=95) Output:["_col0","_col1"] Map Join Operator [MAPJOIN_16] 
Conds:TS_14.reducesinkkey0=TS_14.reducesinkkey0(Inner),Output:["_col0","_col2"] @@ -5496,7 +5496,7 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_10] - Join Operator [JOIN_8] (rows=809 width=356) + Join Operator [JOIN_8] (rows=791 width=356) Output:["_col0","_col1","_col2","_col3"],condition map:[{"":"{\"type\":\"Inner\",\"left\":0,\"right\":1}"}],keys:{"0":"_col0","1":"_col0"} <-Map 1 [PARTITION-LEVEL SORT] PARTITION-LEVEL SORT [RS_6] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/explainanalyze_1.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_1.q.out b/ql/src/test/results/clientpositive/tez/explainanalyze_1.q.out index 070fca7..c253fd2 100644 --- a/ql/src/test/results/clientpositive/tez/explainanalyze_1.q.out +++ b/ql/src/test/results/clientpositive/tez/explainanalyze_1.q.out @@ -251,7 +251,7 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_5] - Group By Operator [GBY_3] (rows=309/309 width=95) + Group By Operator [GBY_3] (rows=316/309 width=95) Output:["_col0","_col1"],aggregations:["count(KEY._col0)"],keys:KEY._col0 <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_2] @@ -298,7 +298,7 @@ Stage-0 Output:["_col0"],aggregations:["count()"] <-Reducer 2 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_10] - Merge Join Operator [MERGEJOIN_18] (rows=267/0 width=8) + Merge Join Operator [MERGEJOIN_18] (rows=262/0 width=8) Conds:RS_6._col0=RS_7._col0(Inner) <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_6] @@ -347,7 +347,7 @@ Stage-0 Output:["_col0"],aggregations:["count()"] <-Reducer 2 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_10] - Merge Join Operator [MERGEJOIN_18] (rows=267/1019 width=8) + Merge Join Operator [MERGEJOIN_18] (rows=262/1019 width=8) Conds:RS_6._col0=RS_7._col0(Inner) <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_6] @@ -451,9 +451,9 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_10] - Select Operator [SEL_9] (rows=809/1028 
width=178) + Select Operator [SEL_9] (rows=791/1028 width=178) Output:["_col0","_col1"] - Merge Join Operator [MERGEJOIN_15] (rows=809/1028 width=178) + Merge Join Operator [MERGEJOIN_15] (rows=791/1028 width=178) Conds:RS_6._col0=RS_7._col0(Inner),Output:["_col0","_col2"] <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_6] @@ -513,9 +513,9 @@ Stage-0 Stage-1 Reducer 2 File Output Operator [FS_9] - Transform Operator [SCR_8] (rows=809/1028 width=178) + Transform Operator [SCR_8] (rows=791/1028 width=178) command:cat - Merge Join Operator [MERGEJOIN_14] (rows=809/1028 width=178) + Merge Join Operator [MERGEJOIN_14] (rows=791/1028 width=178) Conds:RS_3.key=RS_5.key(Inner),Output:["_col0","_col1"] <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_3] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out index bccfa04..42bad01 100644 --- a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out +++ b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out @@ -847,9 +847,9 @@ Stage-0 Stage-1 Map 2 vectorized File Output Operator [FS_34] - Select Operator [SEL_33] (rows=391/480 width=186) + Select Operator [SEL_33] (rows=399/480 width=186) Output:["_col0","_col1","_col2"] - Map Join Operator [MAPJOIN_32] (rows=391/480 width=186) + Map Join Operator [MAPJOIN_32] (rows=399/480 width=186) BucketMapJoin:true,Conds:RS_29._col0=SEL_31._col0(Inner),HybridGraceHashJoin:true,Output:["_col0","_col1","_col3"] <-Map 1 [CUSTOM_EDGE] vectorized MULTICAST [RS_29] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/explainanalyze_4.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_4.q.out 
b/ql/src/test/results/clientpositive/tez/explainanalyze_4.q.out index 5c17512..9d14557 100644 --- a/ql/src/test/results/clientpositive/tez/explainanalyze_4.q.out +++ b/ql/src/test/results/clientpositive/tez/explainanalyze_4.q.out @@ -44,11 +44,11 @@ Stage-0 Stage-1 Reducer 3 File Output Operator [FS_12] - Select Operator [SEL_11] (rows=2076/10 width=553) + Select Operator [SEL_11] (rows=2048/10 width=552) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17","_col18","_col19","_col20","_col21","_col22","_col23"] <-Reducer 2 [SIMPLE_EDGE] SHUFFLE [RS_10] - Merge Join Operator [MERGEJOIN_17] (rows=2076/10 width=553) + Merge Join Operator [MERGEJOIN_17] (rows=2048/10 width=552) Conds:RS_6._col2=RS_7._col2(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17","_col18","_col19","_col20","_col21","_col22","_col23"] <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_6] @@ -143,7 +143,7 @@ Stage-0 Output:["_col0"],aggregations:["count()"] <-Reducer 2 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_10] - Merge Join Operator [MERGEJOIN_18] (rows=2076/10 width=8) + Merge Join Operator [MERGEJOIN_18] (rows=2048/10 width=8) Conds:RS_6._col0=RS_7._col0(Inner) <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_6] @@ -232,16 +232,16 @@ Stage-0 Stage-1 Reducer 4 File Output Operator [FS_15] - Select Operator [SEL_14] (rows=623/5 width=11) + Select Operator [SEL_14] (rows=631/5 width=11) Output:["_col0","_col1"] <-Reducer 3 [SIMPLE_EDGE] SHUFFLE [RS_13] - Group By Operator [GBY_11] (rows=623/5 width=11) + Group By Operator [GBY_11] (rows=631/5 width=11) Output:["_col0","_col1"],aggregations:["count()"],keys:KEY._col0 <-Reducer 2 [SIMPLE_EDGE] SHUFFLE [RS_10] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_20] (rows=2076/10 width=3) + Merge Join Operator [MERGEJOIN_20] (rows=2048/10 width=3) 
Conds:RS_6._col1=RS_7._col0(Inner),Output:["_col0"] <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_6] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/explainanalyze_5.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_5.q.out b/ql/src/test/results/clientpositive/tez/explainanalyze_5.q.out index fd71c0c..75f29fa 100644 --- a/ql/src/test/results/clientpositive/tez/explainanalyze_5.q.out +++ b/ql/src/test/results/clientpositive/tez/explainanalyze_5.q.out @@ -114,9 +114,9 @@ Stage-3 <-Reducer 4 [CUSTOM_SIMPLE_EDGE] File Output Operator [FS_19] table:{"name:":"default.src_multi2_n7"} - Select Operator [SEL_18] (rows=849/508 width=178) + Select Operator [SEL_18] (rows=830/508 width=178) Output:["_col0","_col1"] - Merge Join Operator [MERGEJOIN_26] (rows=849/508 width=178) + Merge Join Operator [MERGEJOIN_26] (rows=830/508 width=178) Conds:RS_15._col0=RS_16._col0(Inner),Output:["_col0","_col3"] <-Map 7 [SIMPLE_EDGE] SHUFFLE [RS_16] @@ -154,7 +154,7 @@ Stage-3 TableScan [TS_3] (rows=25/25 width=175) Output:["key","value"] PARTITION_ONLY_SHUFFLE [RS_2] - Select Operator [SEL_1] (rows=849/508 width=178) + Select Operator [SEL_1] (rows=830/508 width=178) Output:["key","value"] Please refer to the previous Select Operator [SEL_18] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/explainuser_3.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out index 8b7b11d..d993905 100644 --- a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out +++ b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out @@ -675,9 +675,9 @@ Stage-0 Stage-1 Map 2 vectorized File Output Operator [FS_34] - Select Operator [SEL_33] (rows=391 width=186) + Select Operator 
[SEL_33] (rows=399 width=186) Output:["_col0","_col1","_col2"] - Map Join Operator [MAPJOIN_32] (rows=391 width=186) + Map Join Operator [MAPJOIN_32] (rows=399 width=186) BucketMapJoin:true,Conds:RS_29._col0=SEL_31._col0(Inner),HybridGraceHashJoin:true,Output:["_col0","_col1","_col3"] <-Map 1 [CUSTOM_EDGE] vectorized MULTICAST [RS_29] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_1.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_1.q.out b/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_1.q.out index 5b9149c..910a812 100644 --- a/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_1.q.out +++ b/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_1.q.out @@ -56,7 +56,7 @@ STAGE PLANS: 1 _col0 (type: int) input vertices: 1 Map 3 - Statistics: Num rows: 25057 Data size: 200456 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24737 Data size: 197896 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -175,7 +175,7 @@ STAGE PLANS: 1 _col0 (type: int) input vertices: 1 Map 3 - Statistics: Num rows: 25057 Data size: 200456 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24737 Data size: 197896 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -293,7 +293,7 @@ STAGE PLANS: 1 _col0 (type: int) input vertices: 1 Map 3 - Statistics: Num rows: 18702 Data size: 149616 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 18464 Data size: 147712 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -408,7 +408,7 @@ STAGE PLANS: 1 _col0 (type: int) input vertices: 1 Map 3 - Statistics: Num rows: 18702 Data size: 149616 Basic stats: COMPLETE Column stats: COMPLETE + 
Statistics: Num rows: 18464 Data size: 147712 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -521,7 +521,7 @@ STAGE PLANS: 1 _col0 (type: int) input vertices: 1 Map 3 - Statistics: Num rows: 25057 Data size: 200456 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24737 Data size: 197896 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -630,7 +630,7 @@ STAGE PLANS: 1 _col0 (type: int) input vertices: 1 Map 3 - Statistics: Num rows: 25057 Data size: 200456 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24737 Data size: 197896 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_2.q.out b/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_2.q.out index 3bacb4a..a3a77f9 100644 --- a/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_2.q.out +++ b/ql/src/test/results/clientpositive/tez/hybridgrace_hashjoin_2.q.out @@ -72,7 +72,7 @@ STAGE PLANS: input vertices: 0 Map 1 2 Map 4 - Statistics: Num rows: 261 Data size: 2088 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 250 Data size: 2000 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -200,7 +200,7 @@ STAGE PLANS: input vertices: 0 Map 1 2 Map 4 - Statistics: Num rows: 261 Data size: 2088 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 250 Data size: 2000 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -343,7 +343,7 @@ STAGE PLANS: 0 Map 1 2 Map 4 3 Map 5 - Statistics: 
Num rows: 1694 Data size: 13552 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1584 Data size: 12672 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -492,7 +492,7 @@ STAGE PLANS: 0 Map 1 2 Map 4 3 Map 5 - Statistics: Num rows: 1694 Data size: 13552 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1584 Data size: 12672 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -671,7 +671,7 @@ STAGE PLANS: input vertices: 0 Map 1 2 Map 6 - Statistics: Num rows: 261 Data size: 2088 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 250 Data size: 2000 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -729,7 +729,7 @@ STAGE PLANS: input vertices: 0 Map 7 2 Map 10 - Statistics: Num rows: 261 Data size: 2088 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 265 Data size: 2120 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -915,7 +915,7 @@ STAGE PLANS: input vertices: 0 Map 1 2 Map 6 - Statistics: Num rows: 261 Data size: 2088 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 250 Data size: 2000 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -974,7 +974,7 @@ STAGE PLANS: input vertices: 0 Map 7 2 Map 10 - Statistics: Num rows: 261 Data size: 2088 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 265 Data size: 2120 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() @@ -1157,7 +1157,7 @@ STAGE PLANS: input vertices: 0 Map 1 2 Map 4 - Statistics: Num rows: 20 Data size: 1780 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 19 Data size: 1691 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator 
condition map: Inner Join 0 to 1 @@ -1169,7 +1169,7 @@ STAGE PLANS: input vertices: 1 Map 5 2 Map 6 - Statistics: Num rows: 204 Data size: 1632 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 196 Data size: 1568 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() mode: hash @@ -1347,7 +1347,7 @@ STAGE PLANS: input vertices: 0 Map 1 2 Map 4 - Statistics: Num rows: 20 Data size: 1780 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 19 Data size: 1691 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Map Join Operator condition map: @@ -1360,7 +1360,7 @@ STAGE PLANS: input vertices: 1 Map 5 2 Map 6 - Statistics: Num rows: 204 Data size: 1632 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 196 Data size: 1568 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/ql/src/test/results/clientpositive/tez/tez-tag.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/tez-tag.q.out b/ql/src/test/results/clientpositive/tez/tez-tag.q.out index 55ce485..cf96067 100644 --- a/ql/src/test/results/clientpositive/tez/tez-tag.q.out +++ b/ql/src/test/results/clientpositive/tez/tez-tag.q.out @@ -190,7 +190,7 @@ Stage-0 PARTITION_ONLY_SHUFFLE [RS_17] Group By Operator [GBY_16] (rows=1 width=8) Output:["_col0"],aggregations:["count()"] - Merge Join Operator [MERGEJOIN_30] (rows=63 width=8) + Merge Join Operator [MERGEJOIN_30] (rows=64 width=8) Conds:RS_12._col0=RS_13._col0(Inner) <-Map 6 [SIMPLE_EDGE] SHUFFLE [RS_13] http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java ---------------------------------------------------------------------- diff --git 
a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java index 4e4dfb7..b630fa3 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java @@ -25,6 +25,7 @@ import java.util.Arrays; import org.apache.hadoop.hive.common.ndv.fm.FMSketch; import org.apache.hadoop.hive.common.ndv.fm.FMSketchUtils; import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; +import org.apache.hadoop.hive.common.ndv.hll.HyperLogLogUtils; public class NumDistinctValueEstimatorFactory { @@ -44,7 +45,7 @@ public class NumDistinctValueEstimatorFactory { if (isFMSketch(buf)) { return FMSketchUtils.deserializeFM(buf); } else { - return HyperLogLog.builder().build().deserialize(buf); + return HyperLogLogUtils.deserializeHLL(buf); } } catch (IOException e) { throw new RuntimeException(e); @@ -56,7 +57,7 @@ public class NumDistinctValueEstimatorFactory { if (n instanceof FMSketch) { return new FMSketch(((FMSketch) n).getNumBitVectors()); } else { - return HyperLogLog.builder().build(); + return HyperLogLog.builder().setSizeOptimized().build(); } } @@ -65,7 +66,7 @@ public class NumDistinctValueEstimatorFactory { if ("fm".equals(func.toLowerCase())) { return new FMSketch(numBitVectors); } else if ("hll".equals(func.toLowerCase())) { - return HyperLogLog.builder().build(); + return HyperLogLog.builder().setSizeOptimized().build(); } else { throw new RuntimeException("Can not recognize " + func); } http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java ---------------------------------------------------------------------- diff --git 
a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java index 12897fc..422bfbe 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLDenseRegister.java @@ -62,6 +62,31 @@ public class HLLDenseRegister implements HLLRegister { return set(registerIdx, (byte) lr); } + // this is a lossy invert of the function above, which produces a hashcode + // which collides with the current winner of the register (we lose all higher + // bits, but we get all bits useful for lesser p-bit options) + + // +-------------|-------------+ + // |xxxx100000000|1000000000000| (lr=9 + idx=1024) + // +-------------|-------------+ + // \ + // +---------------|-----------+ + // |xxxx10000000010|00000000000| (lr=2 + idx=0) + // +---------------|-----------+ + + // This shows the relevant bits of the original hash value + // and how the conversion is moving bits from the index value + // over to the leading zero computation + + public void extractLowBitsTo(HLLRegister dest) { + for (int idx = 0; idx < register.length; idx++) { + byte lr = register[idx]; // this can be a max of 65, never > 127 + if (lr != 0) { + dest.add((long) ((1 << (p + lr - 1)) | idx)); + } + } + } + public boolean set(int idx, byte value) { boolean updated = false; if (idx < register.length && value > register[idx]) { http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java index 
d62b858..d5ac54a 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HLLSparseRegister.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.common.ndv.hll; import java.util.Map; +import java.util.Map.Entry; import java.util.TreeMap; public class HLLSparseRegister implements HLLRegister { @@ -187,6 +188,18 @@ public class HLLSparseRegister implements HLLRegister { return sparseMap; } + // this is effectively the same as the dense register impl. + public void extractLowBitsTo(HLLRegister dest) { + for (Entry<Integer, Byte> entry : getSparseMap().entrySet()) { + int idx = entry.getKey(); + byte lr = entry.getValue(); // this can be a max of 65, never > 127 + if (lr != 0) { + // should be a no-op for sparse + dest.add((long) ((1 << (p + lr - 1)) | idx)); + } + } + } + public int getP() { return p; } http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java index a3cc989..91a6865 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java @@ -18,10 +18,8 @@ package org.apache.hadoop.hive.common.ndv.hll; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.Map; @@ -160,6 +158,13 @@ public class HyperLogLog implements NumDistinctValueEstimator { return this; } + public HyperLogLogBuilder 
setSizeOptimized() { + // allowing this to be increased via config breaks the merge impl + // p=10 = ~1kb per vector or smaller + this.numRegisterIndexBits = 10; + return this; + } + public HyperLogLogBuilder setEncoding(EncodingType enc) { this.encoding = enc; return this; @@ -431,12 +436,23 @@ public class HyperLogLog implements NumDistinctValueEstimator { * @throws IllegalArgumentException */ public void merge(HyperLogLog hll) { - if (p != hll.p || chosenHashBits != hll.chosenHashBits) { + if (chosenHashBits != hll.chosenHashBits) { throw new IllegalArgumentException( "HyperLogLog cannot be merged as either p or hashbits are different. Current: " + toString() + " Provided: " + hll.toString()); } + if (p > hll.p) { + throw new IllegalArgumentException( + "HyperLogLog cannot merge a smaller p into a larger one : " + + toString() + " Provided: " + hll.toString()); + } + + if (p != hll.p) { + // invariant: p < hll.p + hll = hll.squash(p); + } + EncodingType otherEncoding = hll.getEncoding(); if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.SPARSE)) { @@ -464,7 +480,37 @@ public class HyperLogLog implements NumDistinctValueEstimator { } /** - * Converts sparse to dense hll register + * Reduces the accuracy of the HLL provided to a smaller size + * @param p0 + * - new p size for the new HyperLogLog (smaller or no change) + * @return reduced (or same) HyperLogLog instance + */ + public HyperLogLog squash(final int p0) { + if (p0 > p) { + throw new IllegalArgumentException( + "HyperLogLog cannot be be squashed to be bigger. 
Current: " + + toString() + " Provided: " + p0); + } + + if (p0 == p) { + return this; + } + + final HyperLogLog hll = new HyperLogLogBuilder() + .setNumRegisterIndexBits(p0).setEncoding(EncodingType.DENSE) + .enableNoBias(noBias).build(); + final HLLDenseRegister result = hll.denseRegister; + + if (encoding == EncodingType.SPARSE) { + sparseRegister.extractLowBitsTo(result); + } else if (encoding == EncodingType.DENSE) { + denseRegister.extractLowBitsTo(result); + } + return hll; + } + + /** + * Converts sparse to dense hll register. * @param sparseRegister * - sparse register to be converted * @return converted dense register @@ -576,14 +622,7 @@ public class HyperLogLog implements NumDistinctValueEstimator { @Override public NumDistinctValueEstimator deserialize(byte[] buf) { - InputStream is = new ByteArrayInputStream(buf); - try { - HyperLogLog result = HyperLogLogUtils.deserializeHLL(is); - is.close(); - return result; - } catch (IOException e) { - throw new RuntimeException(e); - } + return HyperLogLogUtils.deserializeHLL(buf); } @Override http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java index 4e6510b..aeba2e9 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLogUtils.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.common.ndv.hll; +import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; @@ -126,7 +127,7 @@ public class HyperLogLogUtils { } /** - * Refer serializeHLL() for 
format of serialization. This funtions + * Refer serializeHLL() for format of serialization. This function * deserializes the serialized hyperloglogs * @param in * - input stream @@ -198,6 +199,22 @@ public class HyperLogLogUtils { return result; } + /** + * This function deserializes the serialized hyperloglogs from a byte array. + * @param buf - to deserialize + * @return HyperLogLog + */ + public static HyperLogLog deserializeHLL(final byte[] buf) { + InputStream is = new ByteArrayInputStream(buf); // TODO: use faster non-sync inputstream + try { + HyperLogLog result = deserializeHLL(is); + is.close(); + return result; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + private static void bitpackHLLRegister(OutputStream out, byte[] register, int bitWidth) throws IOException { int bitsLeft = 8; http://git-wip-us.apache.org/repos/asf/hive/blob/13960aa9/standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java b/standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java index 617d9c3..e014fb5 100644 --- a/standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java +++ b/standalone-metastore/src/test/java/org/apache/hadoop/hive/common/ndv/hll/TestHyperLogLog.java @@ -37,14 +37,18 @@ public class TestHyperLogLog { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.DENSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double 
threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; + double delta4 = threshold * (4*size) / 100; assertEquals((double) size, (double) hll.count(), delta); assertEquals((double) size, (double) hll2.count(), delta); @@ -63,8 +67,13 @@ public class TestHyperLogLog { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match + // valid merge -- register set size gets bigger (also 4k items hll.merge(hll4); + assertEquals((double) 4 * size, (double) hll.count(), delta4); + assertEquals(EncodingType.DENSE, hll.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test(expected = IllegalArgumentException.class) @@ -74,14 +83,18 @@ public class TestHyperLogLog { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.SPARSE).build(); int size = 500; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; + double delta4 = threshold * (4*size) / 100; assertEquals((double) size, (double) hll.count(), delta); assertEquals((double) size, (double) hll2.count(), delta); @@ -100,8 +113,13 @@ public class TestHyperLogLog { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.SPARSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match + // valid merge -- register set size gets bigger & dense automatically hll.merge(hll4); + assertEquals((double) 4 * size, (double) hll.count(), delta4); + assertEquals(EncodingType.DENSE, hll.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test(expected = IllegalArgumentException.class) @@ -111,11 +129,14 @@ public class TestHyperLogLog { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.DENSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.DENSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; @@ -137,8 +158,13 @@ public class TestHyperLogLog { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match - hll.merge(hll4); + // merge should convert hll2 to DENSE + hll2.merge(hll4); + assertEquals((double) 2 * size, (double) hll2.count(), delta); + assertEquals(EncodingType.DENSE, hll2.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test(expected = IllegalArgumentException.class) @@ -148,11 +174,14 @@ public class TestHyperLogLog { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.SPARSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; @@ -174,8 +203,14 @@ public class TestHyperLogLog { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match - hll.merge(hll4); + // merge should convert hll3 to DENSE + hll3.merge(hll4); + assertEquals((double) 2 * size, (double) hll3.count(), delta); + assertEquals(EncodingType.DENSE, hll3.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); + } @Test(expected = IllegalArgumentException.class) @@ -185,11 +220,14 @@ public class TestHyperLogLog { HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) .setEncoding(EncodingType.SPARSE).build(); + HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) + .setEncoding(EncodingType.SPARSE).build(); int size = 1000; for (int i = 0; i < size; i++) { hll.addLong(i); hll2.addLong(size + i); hll3.addLong(2 * size + i); + hll4.addLong(3 * size + i); } double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; double delta = threshold * size / 100; @@ -211,8 +249,13 @@ public class TestHyperLogLog { assertEquals((double) 3 * size, (double) hll.count(), delta); assertEquals(EncodingType.DENSE, hll.getEncoding()); - // invalid merge -- register set size doesn't match - hll.merge(hll4); + // merge should convert hll2 to DENSE + hll2.merge(hll4); + assertEquals((double) 2 * size, (double) hll2.count(), delta); + assertEquals(EncodingType.DENSE, hll2.getEncoding()); + + // invalid merge -- smaller register merge to bigger + hll.merge(hll5); } @Test @@ -227,4 +270,69 @@ public class TestHyperLogLog { double delta = threshold * size / 100; assertEquals((double) size, (double) hll.count(), delta); } + + @Test + public void testHLLSquash() { + + int[] sizes = new int[] { 500, 1000, 2300, 4096}; + int minBits = 9; + for (final int size : sizes) { + + HyperLogLog hlls[] = new HyperLogLog[16]; + for (int k = minBits; k < hlls.length; k++) { + final HyperLogLog hll = HyperLogLog.builder() + .setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(k).build(); + for (int i = 0; i < size; i++) { + hll.addLong(i); + } + hlls[k] = hll; + } + + for (int k = minBits; k < hlls.length; k++) { + for (int j = k + 1; j < hlls.length; j++) { + final HyperLogLog large = hlls[j]; + final HyperLogLog small = hlls[k]; + final HyperLogLog mush = large + .squash(small.getNumRegisterIndexBits()); + assertEquals(small.count(), mush.count(), 0); + double delta = Math.ceil(small.getStandardError()*size); + assertEquals((double) size, (double) mush.count(), delta); + } + } + } + } + + @Test + public void testHLLDenseDenseSquash() { + HyperLogLog p14HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(14).build(); + HyperLogLog p10HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(10).build(); + int size = 1_000_000; + for (int i = 0; i < size; i++) { + p14HLL.addLong(i); + } + + for (int i = 0; i < 
10_000; i++) { + p10HLL.addLong(i); + } + + p14HLL.squash(p10HLL.getNumRegisterIndexBits()); + assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0); + } + + @Test + public void testHLLSparseDenseSquash() { + HyperLogLog p14HLL = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).setNumRegisterIndexBits(14).build(); + HyperLogLog p10HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(10).build(); + int size = 2000; + for (int i = 0; i < size; i++) { + p14HLL.addLong(i); + } + + for (int i = 0; i < 10_000; i++) { + p10HLL.addLong(i); + } + + p14HLL.squash(p10HLL.getNumRegisterIndexBits()); + assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0); + } }
