Repository: hive Updated Branches: refs/heads/master f25b86520 -> df722342a
HIVE-13750: Avoid additional shuffle stage created by Sorted Dynamic Partition Optimizer when possible (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/df722342 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/df722342 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/df722342 Branch: refs/heads/master Commit: df722342ac4b4cc39c5edef266bde16bb6bb56ac Parents: f25b865 Author: Jesus Camacho Rodriguez <[email protected]> Authored: Thu May 12 19:58:12 2016 +0100 Committer: Jesus Camacho Rodriguez <[email protected]> Committed: Thu May 19 21:24:19 2016 +0100 ---------------------------------------------------------------------- .../clientpositive/udf_row_sequence.q.out | 1021 +++++++++--------- .../optimizer/SortedDynPartitionOptimizer.java | 2 - .../correlation/ReduceSinkDeDuplication.java | 135 ++- .../dynpart_sort_opt_vectorization.q.out | 108 +- .../dynpart_sort_optimization.q.out | 36 +- .../dynpart_sort_optimization_acid.q.out | 132 +-- .../clientpositive/reducesink_dedup.q.out | 48 +- .../tez/dynpart_sort_opt_vectorization.q.out | 63 +- .../tez/dynpart_sort_optimization.q.out | 20 +- 9 files changed, 720 insertions(+), 845 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/contrib/src/test/results/clientpositive/udf_row_sequence.q.out ---------------------------------------------------------------------- diff --git a/contrib/src/test/results/clientpositive/udf_row_sequence.q.out b/contrib/src/test/results/clientpositive/udf_row_sequence.q.out index cc01db0..14798ae 100644 --- a/contrib/src/test/results/clientpositive/udf_row_sequence.q.out +++ b/contrib/src/test/results/clientpositive/udf_row_sequence.q.out @@ -35,8 +35,7 @@ order by r POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 STAGE PLANS: Stage: Stage-1 @@ -50,30 +49,10 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: string) + key expressions: row_sequence() (type: bigint) sort order: + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: string), row_sequence() (type: bigint) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col1 (type: bigint) - sort order: + - Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: string) + value expressions: _col0 (type: string) Reduce Operator Tree: Select Operator expressions: VALUE._col0 (type: string), KEY.reducesinkkey0 (type: bigint) @@ -105,506 +84,506 @@ order by r POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### -0 1 -0 2 -0 3 -10 4 -100 5 -100 6 -103 7 -103 8 -104 9 -104 10 -105 11 -11 12 -111 13 -113 14 -113 15 -114 16 -116 17 -118 18 -118 19 -119 20 -119 21 -119 22 -12 23 -12 24 -120 25 -120 26 -125 27 -125 28 -126 29 -128 30 -128 31 -128 32 -129 33 -129 34 -131 35 -133 36 -134 37 -134 38 -136 39 -137 40 -137 41 -138 42 -138 43 -138 44 -138 45 -143 46 -145 47 -146 48 -146 49 -149 50 -149 51 -15 52 -15 53 -150 54 -152 55 -152 56 -153 57 -155 58 -156 59 -157 60 -158 61 -160 62 -162 63 -163 64 -164 65 -164 66 -165 67 -165 68 -166 69 -167 70 -167 71 -167 72 -168 73 -169 74 -169 75 -169 76 -169 77 +238 1 +86 2 +311 3 +27 4 +165 5 +409 6 +255 7 +278 8 +98 9 +484 10 +265 11 +193 12 +401 13 +150 14 +273 15 +224 16 +369 17 +66 18 +128 19 +213 20 +146 21 +406 22 +429 23 +374 24 +152 25 +469 26 +145 27 +495 28 +37 29 +327 30 +281 31 +277 32 +209 33 +15 34 +82 35 +403 36 +166 37 +417 38 +430 39 +252 40 +292 41 +219 42 +287 43 +153 44 +193 45 +338 46 +446 47 +459 48 +394 49 +237 50 +482 51 +174 52 +413 53 +494 54 +207 55 +199 56 +466 57 +208 58 +174 59 +399 60 +396 61 +247 62 +417 63 +489 64 +162 65 +377 66 +397 67 +309 68 +365 69 +266 70 +439 71 +342 72 +367 73 +325 74 +167 75 +195 76 +475 77 17 78 -170 79 -172 80 -172 81 -174 82 -174 83 -175 84 -175 85 -176 86 -176 87 -177 88 -178 89 -179 90 -179 91 -18 92 -18 93 -180 94 -181 95 -183 96 -186 97 -187 98 -187 99 -187 100 -189 101 -19 102 -190 103 -191 104 -191 105 -192 106 -193 107 -193 108 -193 109 -194 110 -195 111 -195 112 -196 113 -197 114 -197 115 -199 116 -199 117 -199 118 -2 119 -20 120 -200 121 -200 122 -201 123 -202 124 -203 125 -203 126 -205 127 -205 128 -207 129 -207 130 -208 131 -208 132 -208 133 -209 134 -209 135 -213 136 -213 137 -214 138 -216 139 -216 140 -217 141 -217 142 -218 143 -219 144 -219 145 -221 146 -221 147 -222 148 -223 149 -223 150 -224 151 -224 152 -226 153 -228 154 -229 155 -229 156 -230 157 -230 158 -230 159 -230 160 -230 161 -233 162 -233 163 -235 164 -237 165 -237 166 -238 167 -238 168 -239 169 -239 170 -24 171 -24 172 -241 173 -242 174 -242 175 -244 176 -247 177 -248 178 -249 179 -252 180 -255 181 -255 182 -256 183 -256 184 -257 185 -258 186 -26 187 -26 188 -260 189 -262 190 -263 191 -265 192 -265 193 -266 194 -27 195 -272 196 -272 197 -273 198 -273 199 -273 200 -274 201 -275 202 -277 203 -277 204 -277 205 -277 206 -278 207 -278 208 -28 209 -280 210 -280 211 -281 212 -281 213 -282 214 -282 215 -283 216 -284 217 -285 218 -286 219 -287 220 -288 221 -288 222 -289 223 -291 224 -292 225 -296 226 -298 227 -298 228 -298 229 -30 230 -302 231 -305 232 -306 233 -307 234 -307 235 -308 236 -309 237 -309 238 -310 239 -311 240 -311 241 -311 242 -315 243 -316 244 -316 245 -316 246 -317 247 -317 248 -318 249 -318 250 -318 251 -321 252 -321 253 -322 254 -322 255 -323 256 -325 257 -325 258 -327 259 -327 260 -327 261 -33 262 -331 263 -331 264 -332 265 -333 266 -333 267 -335 268 -336 269 -338 270 -339 271 -34 272 -341 273 -342 274 -342 275 -344 276 -344 277 -345 278 -348 279 -348 280 -348 281 -348 282 -348 283 -35 284 -35 285 -35 286 -351 287 -353 288 -353 289 -356 290 -360 291 -362 292 -364 293 -365 294 -366 295 -367 296 -367 297 -368 298 -369 299 -369 300 -369 301 -37 302 -37 303 -373 304 -374 305 -375 306 -377 307 -378 308 -379 309 -382 310 -382 311 -384 312 -384 313 -384 314 -386 315 -389 316 -392 317 -393 318 -394 319 -395 320 -395 321 -396 322 -396 323 -396 324 -397 325 -397 326 -399 327 -399 328 -4 329 -400 330 +113 79 +155 80 +203 81 +339 82 +0 83 +455 84 +128 85 +311 86 +316 87 +57 88 +302 89 +205 90 +149 91 +438 92 +345 93 +129 94 +170 95 +20 96 +489 97 +157 98 +378 99 +221 100 +92 101 +111 102 +47 103 +72 104 +4 105 +280 106 +35 107 +427 108 +277 109 +208 110 +356 111 +399 112 +169 113 +382 114 +498 115 +125 116 +386 117 +437 118 +469 119 +192 120 +286 121 +187 122 +176 123 +54 124 +459 125 +51 126 +138 127 +103 128 +239 129 +213 130 +216 131 +430 132 +278 133 +176 134 +289 135 +221 136 +65 137 +318 138 +332 139 +311 140 +275 141 +137 142 +241 143 +83 144 +333 145 +180 146 +284 147 +12 148 +230 149 +181 150 +67 151 +260 152 +404 153 +384 154 +489 155 +353 156 +373 157 +272 158 +138 159 +217 160 +84 161 +348 162 +466 163 +58 164 +8 165 +411 166 +230 167 +208 168 +348 169 +24 170 +463 171 +431 172 +179 173 +172 174 +42 175 +129 176 +158 177 +119 178 +496 179 +0 180 +322 181 +197 182 +468 183 +393 184 +454 185 +100 186 +298 187 +199 188 +191 189 +418 190 +96 191 +26 192 +165 193 +327 194 +230 195 +205 196 +120 197 +131 198 +51 199 +404 200 +43 201 +436 202 +156 203 +469 204 +468 205 +308 206 +95 207 +196 208 +288 209 +481 210 +457 211 +98 212 +282 213 +197 214 +187 215 +318 216 +318 217 +409 218 +470 219 +137 220 +369 221 +316 222 +169 223 +413 224 +85 225 +77 226 +0 227 +490 228 +87 229 +364 230 +179 231 +118 232 +134 233 +395 234 +282 235 +138 236 +238 237 +419 238 +15 239 +118 240 +72 241 +90 242 +307 243 +19 244 +435 245 +10 246 +277 247 +273 248 +306 249 +224 250 +309 251 +389 252 +327 253 +242 254 +369 255 +392 256 +272 257 +331 258 +401 259 +242 260 +452 261 +177 262 +226 263 +5 264 +497 265 +402 266 +396 267 +317 268 +395 269 +58 270 +35 271 +336 272 +95 273 +11 274 +168 275 +34 276 +229 277 +233 278 +143 279 +472 280 +322 281 +498 282 +160 283 +195 284 +42 285 +321 286 +430 287 +119 288 +489 289 +458 290 +78 291 +76 292 +41 293 +223 294 +492 295 +149 296 +449 297 +218 298 +228 299 +138 300 +453 301 +30 302 +209 303 +64 304 +468 305 +76 306 +74 307 +342 308 +69 309 +230 310 +33 311 +368 312 +103 313 +296 314 +113 315 +216 316 +367 317 +344 318 +167 319 +274 320 +219 321 +239 322 +485 323 +116 324 +223 325 +256 326 +263 327 +70 328 +487 329 +480 330 401 331 -401 332 -401 333 -401 334 -401 335 -402 336 -403 337 -403 338 -403 339 -404 340 -404 341 -406 342 -406 343 -406 344 -406 345 -407 346 -409 347 -409 348 -409 349 -41 350 -411 351 -413 352 -413 353 -414 354 -414 355 -417 356 -417 357 -417 358 -418 359 -419 360 -42 361 -42 362 -421 363 -424 364 -424 365 -427 366 -429 367 -429 368 -43 369 -430 370 -430 371 -430 372 -431 373 -431 374 -431 375 -432 376 -435 377 -436 378 -437 379 -438 380 -438 381 -438 382 -439 383 -439 384 -44 385 -443 386 -444 387 -446 388 -448 389 -449 390 -452 391 -453 392 -454 393 -454 394 -454 395 -455 396 -457 397 -458 398 -458 399 -459 400 -459 401 -460 402 -462 403 -462 404 -463 405 -463 406 -466 407 -466 408 -466 409 -467 410 -468 411 -468 412 -468 413 -468 414 -469 415 -469 416 -469 417 -469 418 -469 419 -47 420 -470 421 -472 422 -475 423 -477 424 +288 332 +191 333 +5 334 +244 335 +438 336 +128 337 +467 338 +432 339 +202 340 +316 341 +229 342 +469 343 +463 344 +280 345 +2 346 +35 347 +283 348 +331 349 +235 350 +80 351 +44 352 +193 353 +321 354 +335 355 +104 356 +466 357 +366 358 +175 359 +403 360 +483 361 +53 362 +105 363 +257 364 +406 365 +409 366 +190 367 +406 368 +401 369 +114 370 +258 371 +90 372 +203 373 +262 374 +348 375 +424 376 +12 377 +396 378 +201 379 +217 380 +164 381 +431 382 +454 383 +478 384 +298 385 +125 386 +431 387 +164 388 +424 389 +187 390 +382 391 +5 392 +70 393 +397 394 +480 395 +291 396 +24 397 +351 398 +255 399 +104 400 +70 401 +163 402 +438 403 +119 404 +414 405 +200 406 +491 407 +237 408 +439 409 +360 410 +248 411 +479 412 +305 413 +417 414 +199 415 +444 416 +120 417 +429 418 +169 419 +443 420 +323 421 +325 422 +277 423 +230 424 478 425 -478 426 -479 427 -480 428 -480 429 -480 430 -481 431 -482 432 -483 433 -484 434 -485 435 -487 436 -489 437 -489 438 -489 439 -489 440 -490 441 -491 442 -492 443 -492 444 -493 445 -494 446 -495 447 -496 448 -497 449 -498 450 -498 451 -498 452 -5 453 -5 454 -5 455 -51 456 -51 457 -53 458 -54 459 -57 460 -58 461 -58 462 -64 463 -65 464 -66 465 -67 466 -67 467 -69 468 -70 469 -70 470 -70 471 -72 472 -72 473 -74 474 -76 475 -76 476 -77 477 -78 478 -8 479 -80 480 -82 481 -83 482 -83 483 -84 484 -84 485 -85 486 -86 487 -87 488 -9 489 -90 490 -90 491 -90 492 -92 493 -95 494 -95 495 -96 496 -97 497 -97 498 -98 499 -98 500 +178 426 +468 427 +310 428 +317 429 +333 430 +493 431 +460 432 +207 433 +249 434 +265 435 +480 436 +83 437 +136 438 +353 439 +172 440 +214 441 +462 442 +233 443 +406 444 +133 445 +175 446 +189 447 +454 448 +375 449 +401 450 +421 451 +407 452 +384 453 +256 454 +26 455 +134 456 +67 457 +384 458 +379 459 +18 460 +462 461 +492 462 +100 463 +298 464 +9 465 +341 466 +498 467 +146 468 +458 469 +362 470 +186 471 +285 472 +348 473 +167 474 +18 475 +273 476 +183 477 +281 478 +344 479 +97 480 +469 481 +315 482 +84 483 +28 484 +37 485 +448 486 +152 487 +348 488 +307 489 +194 490 +414 491 +477 492 +222 493 +126 494 +90 495 +169 496 +403 497 +400 498 +200 499 +97 500 PREHOOK: query: -- make sure stateful functions do not get short-circuited away -- a true result for key=105 would indicate undesired short-circuiting select key, (key = 105) and (row_sequence() = 1) http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java index 010c89e..4adf7b2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java @@ -246,7 +246,6 @@ public class SortedDynPartitionOptimizer extends Transform { // Create SelectDesc SelectDesc selConf = new SelectDesc(descs, colNames); - // Create Select Operator SelectOperator selOp = (SelectOperator) OperatorFactory.getAndMakeChild( selConf, selRS, rsOp); @@ -420,7 +419,6 @@ public class SortedDynPartitionOptimizer extends Transform { // 1) Partition columns // 2) Bucket number column // 3) Sort columns - // 4) Null sort columns Set<Integer> keyColsPosInVal = Sets.newLinkedHashSet(); ArrayList<ExprNodeDesc> keyCols = Lists.newArrayList(); List<Integer> newSortOrder = Lists.newArrayList(); http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java index 733620b..77771c3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Stack; import org.apache.hadoop.hive.conf.HiveConf; @@ -50,11 +51,12 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.PlanUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; - -import com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * If two reducer sink operators share the same partition/sort columns and order, @@ -65,6 +67,8 @@ import com.google.common.collect.Lists; */ public class ReduceSinkDeDuplication extends Transform { + protected static final Logger LOG = LoggerFactory.getLogger(ReduceSinkDeDuplication.class); + private static final String RS = ReduceSinkOperator.getOperatorName(); private static final String GBY = GroupByOperator.getOperatorName(); private static final String JOIN = JoinOperator.getOperatorName(); @@ -253,7 +257,7 @@ public class ReduceSinkDeDuplication extends Transform { */ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { - int[] result = checkStatus(cRS, pRS, minReducer); + int[] result = extractMergeDirections(cRS, pRS, minReducer); if (result == null) { return false; } @@ -334,7 +338,7 @@ public class ReduceSinkDeDuplication extends Transform { * 2. for -1, configuration of parent RS is more specific than child RS * 3. for 1, configuration of child RS is more specific than parent RS */ - private int[] checkStatus(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) + private int[] extractMergeDirections(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException { ReduceSinkDesc cConf = cRS.getConf(); ReduceSinkDesc pConf = pRS.getConf(); @@ -494,6 +498,112 @@ public class ReduceSinkDeDuplication extends Transform { } return 0; } + + protected boolean aggressiveDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS, + ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException { + assert cRS.getNumParent() == 1; + + ReduceSinkDesc cConf = cRS.getConf(); + ReduceSinkDesc pConf = pRS.getConf(); + List<ExprNodeDesc> cKeys = cConf.getKeyCols(); + List<ExprNodeDesc> pKeys = pConf.getKeyCols(); + + // Check that in the path between cRS and pRS, there are only Select operators + // i.e. the sequence must be pRS-SEL*-cRS + Operator<? extends OperatorDesc> parent = cRS.getParentOperators().get(0); + while (parent != pRS) { + assert parent.getNumParent() == 1; + if (!(parent instanceof SelectOperator)) { + return false; + } + parent = parent.getParentOperators().get(0); + } + + // If child keys are null or empty, we bail out + if (cKeys == null || cKeys.isEmpty()) { + return false; + } + // If parent keys are null or empty, we bail out + if (pKeys == null || pKeys.isEmpty()) { + return false; + } + + // Backtrack key columns of cRS to pRS + // If we cannot backtrack any of the columns, bail out + List<ExprNodeDesc> cKeysInParentRS = ExprNodeDescUtils.backtrack(cKeys, cRS, pRS); + for (int i = 0; i < cKeysInParentRS.size(); i++) { + ExprNodeDesc pexpr = cKeysInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(cKeysInParentRS, cRS, pRS)); + + // Backtrack partition columns of cRS to pRS + // If we cannot backtrack any of the columns, bail out + List<ExprNodeDesc> cPartitionInParentRS = ExprNodeDescUtils.backtrack( + cConf.getPartitionCols(), cRS, pRS); + for (int i = 0; i < cPartitionInParentRS.size(); i++) { + ExprNodeDesc pexpr = cPartitionInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(cPartitionInParentRS, cRS, pRS)); + + // Backtrack value columns of cRS to pRS + // If we cannot backtrack any of the columns, bail out + List<ExprNodeDesc> cValueInParentRS = ExprNodeDescUtils.backtrack( + cConf.getValueCols(), cRS, pRS); + for (int i = 0; i < cValueInParentRS.size(); i++) { + ExprNodeDesc pexpr = cValueInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setValueCols(ExprNodeDescUtils.backtrack(cValueInParentRS, cRS, pRS)); + + // Backtrack bucket columns of cRS to pRS (if any) + // If we cannot backtrack any of the columns, bail out + if (cConf.getBucketCols() != null) { + List<ExprNodeDesc> cBucketInParentRS = ExprNodeDescUtils.backtrack( + cConf.getBucketCols(), cRS, pRS); + for (int i = 0; i < cBucketInParentRS.size(); i++) { + ExprNodeDesc pexpr = cBucketInParentRS.get(i); + if (pexpr == null) { + // We cannot backtrack the expression, we bail out + return false; + } + } + cRS.getConf().setBucketCols(ExprNodeDescUtils.backtrack(cBucketInParentRS, cRS, pRS)); + } + + // Update column expression map + for (Entry<String, ExprNodeDesc> e : cRS.getColumnExprMap().entrySet()) { + e.setValue(ExprNodeDescUtils.backtrack(e.getValue(), cRS, pRS)); + } + + // Replace pRS with cRS and remove operator sequence from pRS to cRS + // Recall that the sequence must be pRS-SEL*-cRS + parent = cRS.getParentOperators().get(0); + while (parent != pRS) { + dedupCtx.addRemovedOperator(parent); + parent = parent.getParentOperators().get(0); + } + dedupCtx.addRemovedOperator(pRS); + cRS.getParentOperators().clear(); + for (Operator<? extends OperatorDesc> op : pRS.getParentOperators()) { + op.replaceChild(pRS, cRS); + cRS.getParentOperators().add(op); + } + pRS.getParentOperators().clear(); + pRS.getChildOperators().clear(); + + return true; + } } static class GroupbyReducerProc extends AbsctractReducerReducerProc { @@ -601,11 +711,18 @@ public class ReduceSinkDeDuplication extends Transform { ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent( cRS, ReduceSinkOperator.class, dedupCtx.trustScript()); - if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) { - CorrelationUtilities.replaceReduceSinkWithSelectOperator( - cRS, dedupCtx.getPctx(), dedupCtx); - pRS.getConf().setDeduplicated(true); - return true; + if (pRS != null) { + // Try extended deduplication + if (aggressiveDedup(cRS, pRS, dedupCtx)) { + return true; + } + // Normal deduplication + if (merge(cRS, pRS, dedupCtx.minReducer())) { + CorrelationUtilities.replaceReduceSinkWithSelectOperator( + cRS, dedupCtx.getPctx(), dedupCtx); + pRS.getConf().setDeduplicated(true); + return true; + } } return false; } http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out index d03bfe4..ab8f96c 100644 --- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out @@ -159,9 +159,8 @@ explain insert overwrite table over1k_part_orc partition(ds="foo", t) select si, POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -178,35 +177,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -232,7 +210,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.over1k_part_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part_limit_orc partition(ds="foo", t) select si,i,b,f,t from over1k_orc where t is null or t=27 limit 10 @@ -517,9 +495,8 @@ explain insert into table over1k_part_orc partition(ds="foo", t) select si,i,b,f POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -536,35 +513,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -590,7 +546,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.over1k_part_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert into table over1k_part_limit_orc partition(ds="foo", t) select si,i,b,f,t from over1k_orc where t is null or t=27 limit 10 @@ -1336,9 +1292,8 @@ POSTHOOK: query: explain insert overwrite table over1k_part2_orc partition(ds="f POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1355,35 +1310,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -1409,7 +1343,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.over1k_part2_orc - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part2_orc partition(ds="foo",t) select si,i,b,f,t from (select * from over1k_orc order by i limit 10) tmp where t is null or t=27 http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out index dec872a..391acff 100644 --- a/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out +++ b/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out @@ -1240,9 +1240,8 @@ POSTHOOK: query: explain insert overwrite table over1k_part2 partition(ds="foo", POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1259,32 +1258,11 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) - Reduce Operator Tree: - Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Reduce Operator Tree: Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) @@ -1312,7 +1290,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.over1k_part2 - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: explain insert overwrite table over1k_part2 partition(ds="foo",t) select si,i,b,f,t from (select * from over1k order by i limit 10) tmp where t is null or t=27 http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out b/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out index 8325803..ac95ec2 100644 --- a/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out +++ b/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid.q.out @@ -380,9 +380,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -396,28 +395,9 @@ STAGE PLANS: expressions: ROW__ID (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), ds (type: string) outputColumnNames: _col0, _col3 Reduce Output Operator - key expressions: _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: + - value expressions: _col3 (type: string) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), VALUE._col2 (type: string) - outputColumnNames: _col0, _col3 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: +++ - Map-reduce partition columns: _col3 (type: string) + key expressions: _col3 (type: string), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) + sort order: +++ + Map-reduce partition columns: _col3 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), 'foo' (type: string), 'bar' (type: string), KEY._col3 (type: string), KEY.'_bucket_number' (type: string) @@ -442,7 +422,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds in ('2008-04-08') @@ -894,9 +874,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -910,28 +889,9 @@ STAGE PLANS: expressions: ROW__ID (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), hr (type: int) outputColumnNames: _col0, _col4 Reduce Output Operator - key expressions: _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: + - value expressions: _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), VALUE._col3 (type: int) - outputColumnNames: _col0, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: '2008-04-08' (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: ++++ - Map-reduce partition columns: '2008-04-08' (type: string), _col4 (type: int) + key expressions: '2008-04-08' (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) + sort order: ++++ + Map-reduce partition columns: '2008-04-08' (type: string), _col4 (type: int) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), 'foo' (type: string), 'bar' (type: string), '2008-04-08' (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -957,7 +917,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr>=11 @@ -1091,9 +1051,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1107,29 +1066,10 @@ STAGE PLANS: expressions: ROW__ID (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), key (type: string), 'bar' (type: string), ds (type: string), hr (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Reduce Output Operator - key expressions: _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: + - value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: ++++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: int) - value expressions: _col1 (type: string), _col2 (type: string) + key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) + sort order: ++++ + Map-reduce partition columns: _col3 (type: string), _col4 (type: int) + value expressions: _col1 (type: string), _col2 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), VALUE._col1 (type: string), VALUE._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -1155,7 +1095,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr=11 @@ -1185,9 +1125,8 @@ POSTHOOK: query: explain update acid set value = 'bar' where key = 'foo' and ds= POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-2 depends on stages: Stage-1 - Stage-0 depends on stages: Stage-2 - Stage-3 depends on stages: Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 STAGE PLANS: Stage: Stage-1 @@ -1201,29 +1140,10 @@ STAGE PLANS: expressions: ROW__ID (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), key (type: string), 'bar' (type: string), ds (type: string), hr (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Reduce Output Operator - key expressions: _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: + - value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string), _col4 (type: int) - Reduce Operator Tree: - Select Operator - expressions: KEY.reducesinkkey0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: int) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe - - Stage: Stage-2 - Map Reduce - Map Operator Tree: - TableScan - Reduce Output Operator - key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) - sort order: ++++ - Map-reduce partition columns: _col3 (type: string), _col4 (type: int) - value expressions: _col1 (type: string), _col2 (type: string) + key expressions: _col3 (type: string), _col4 (type: int), '_bucket_number' (type: string), _col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>) + sort order: ++++ + Map-reduce partition columns: _col3 (type: string), _col4 (type: int) + value expressions: _col1 (type: string), _col2 (type: string) Reduce Operator Tree: Select Operator expressions: KEY._col0 (type: struct<transactionid:bigint,bucketid:int,rowid:bigint>), VALUE._col1 (type: string), VALUE._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: int), KEY.'_bucket_number' (type: string) @@ -1249,7 +1169,7 @@ STAGE PLANS: serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde name: default.acid - Stage: Stage-3 + Stage: Stage-2 Stats-Aggr Operator PREHOOK: query: update acid set value = 'bar' where key = 'foo' and ds='2008-04-08' and hr>=11 http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/test/results/clientpositive/reducesink_dedup.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/reducesink_dedup.q.out b/ql/src/test/results/clientpositive/reducesink_dedup.q.out index b89df52..77bffff 100644 --- a/ql/src/test/results/clientpositive/reducesink_dedup.q.out +++ b/ql/src/test/results/clientpositive/reducesink_dedup.q.out @@ -10,29 +10,29 @@ distribute by 1 sort by 1 POSTHOOK: type: QUERY POSTHOOK: Input: default@part #### A masked pattern was here #### +almond azure blanched chiffon midnight +almond aquamarine dodger light gainsboro +almond antique sky peru orange +almond antique medium spring khaki +almond antique blue firebrick mint +almond azure aquamarine papaya violet +almond aquamarine yellow dodger mint +almond aquamarine floral ivory bisque +almond antique violet mint lemon +almond antique gainsboro frosted violet +almond antique olive coral navajo +almond antique misty red olive +almond antique metallic orange dim +almond antique forest lavender goldenrod +almond antique chartreuse khaki white +almond aquamarine sandy cyan gainsboro +almond aquamarine rose maroon antique +almond aquamarine midnight light salmon +almond antique violet turquoise frosted +almond antique violet chocolate turquoise +almond aquamarine pink moccasin thistle +almond aquamarine burnished black steel +almond antique salmon chartreuse burlywood +almond antique chartreuse lavender yellow almond antique burnished rose metallic almond antique burnished rose metallic -almond antique chartreuse lavender yellow -almond antique salmon chartreuse burlywood -almond aquamarine burnished black steel -almond aquamarine pink moccasin thistle -almond antique violet chocolate turquoise -almond antique violet turquoise frosted -almond aquamarine midnight light salmon -almond aquamarine rose maroon antique -almond aquamarine sandy cyan gainsboro -almond antique chartreuse khaki white -almond antique forest lavender goldenrod -almond antique metallic orange dim -almond antique misty red olive -almond antique olive coral navajo -almond antique gainsboro frosted violet -almond antique violet mint lemon -almond aquamarine floral ivory bisque -almond aquamarine yellow dodger mint -almond azure aquamarine papaya violet -almond antique blue firebrick mint -almond antique medium spring khaki -almond antique sky peru orange -almond aquamarine dodger light gainsboro -almond azure blanched chiffon midnight http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out index a90e3f6..9a72586 100644 --- a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out +++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out @@ -169,7 +169,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -185,28 +184,16 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Execution mode: vectorized - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -557,7 +544,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -573,28 +559,16 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col0 (type: smallint) - sort order: + + key expressions: _col4 (type: tinyint), _col0 (type: smallint) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col0 (type: smallint) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Execution mode: vectorized - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE @@ -1418,7 +1392,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1434,28 +1407,16 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Execution mode: vectorized Reducer 2 Execution mode: vectorized Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Execution mode: vectorized - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE http://git-wip-us.apache.org/repos/asf/hive/blob/df722342/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out index 723e819..2f88148 100644 --- a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out +++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out @@ -1329,7 +1329,6 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1345,25 +1344,14 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator - key expressions: _col1 (type: int) - sort order: + + key expressions: _col4 (type: tinyint), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col4 (type: tinyint) Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float), _col4 (type: tinyint) + value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) Reducer 2 Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col1 (type: bigint), VALUE._col2 (type: float), VALUE._col3 (type: tinyint) - outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col4 (type: tinyint), _col1 (type: int) - sort order: ++ - Map-reduce partition columns: _col4 (type: tinyint) - Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float) - Reducer 3 - Reduce Operator Tree: - Select Operator expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint) outputColumnNames: _col0, _col1, _col2, _col3, _col4 Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE
