[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/17122 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104666030 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) --- End diff -- thanks, done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104592747 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) --- End diff -- I think `shouldStopRequired && (this.parent == null || this.parent.isShouldStopRequired)` is better to understand. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user hvanhovell commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104472986 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- The only reason I mentioned sort is that there is no use in stopping early and that it would not be correct to do so. I was not really expecting any improvement. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104472230 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- I agree with you. I just confirmed that this PR is not effective for Sort. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user hvanhovell commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104468446 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- How would sort be improved? Sort is very expensive operation, so it dominates the runtime of the job. The only possible improvement here is that you could avoid sorting with range (assuming that we do not overflow). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104467009 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- I cannot see performance improvement on Sort. I think there are two reasons for this result. One is that the loop body is too large. At the inner-most loop, a [`insertRow` method](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java#L107-L120) is called. I think that the size of this method is too large to facilitate loop optimizations. The other is that the hotspot method is not here. I guess that the hotspot method may be `sort()` at line 154. Here is the generated code. ```java /* 114 */ while (true) { /* 115 */ long range_range = range_batchEnd - range_number; /* 116 */ if (range_range != 0L) { /* 117 */ int range_localEnd = (int)(range_range / -1L); /* 118 */ for (int range_localIdx = 0; range_localIdx < range_localEnd; range_localIdx++) { /* 119 */ long range_value = ((long)range_localIdx * -1L) + range_number; /* 120 */ /* 121 */ range_rowWriter.write(0, range_value); /* 122 */ sort_sorter.insertRow((UnsafeRow)range_result); /* 123 */ /* 124 */ // shouldStop check is eliminated /* 125 */ } /* 126 */ range_number = range_batchEnd; /* 127 */ } /* 128 */ /* 129 */ if (range_taskContext.isInterrupted()) { /* 130 */ throw new TaskKilledException(); /* 131 */ } /* 132 */ /* 133 */ long range_nextBatchTodo; /* 134 */ if (range_numElementsTodo > 1000L) { /* 135 */ range_nextBatchTodo = 1000L; /* 136 */ range_numElementsTodo -= 1000L; /* 137 */ } else { /* 138 */ range_nextBatchTodo = range_numElementsTodo; /* 139 */ range_numElementsTodo = 0; /* 140 */ if (range_nextBatchTodo == 0) break; /* 141 */ } /* 142 */ range_numOutputRows.add(range_nextBatchTodo); /* 143 */ range_inputMetrics.incRecordsRead(range_nextBatchTodo); /* 144 */ /* 145 */ range_batchEnd += range_nextBatchTodo * -1L; /* 146 */ } /* 147 */ /* 148 */ } /* 149 */ /* 150 */ protected void processNext() throws java.io.IOException { /* 151 */ if (sort_needToSort) { /* 152 */ long sort_spillSizeBefore = sort_metrics.memoryBytesSpilled(); /* 153 */ sort_addToSorter(); /* 154 */ sort_sortedIter = sort_sorter.sort(); /* 155 */ sort_sortTime.add(sort_sorter.getSortTimeNanos() / 100); /* 156 */ sort_peakMemory.add(sort_sorter.getPeakMemoryUsage()); /* 157 */ sort_spillSize.add(sort_metrics.memoryBytesSpilled() - sort_spillSizeBefore); /* 158 */ sort_metrics.incPeakExecutionMemory(sort_sorter.getPeakMemoryUsage()); /* 159 */ sort_needToSort = false; /* 160 */ } /* 161 */ /* 162 */ while (sort_sortedIter.hasNext()) { /* 163 */ UnsafeRow sort_outputRow = (UnsafeRow)sort_sortedIter.next(); /* 164 */ /* 165 */ append(sort_outputRow); /* 166 */ /* 167 */ if (shouldStop()) return; /* 168 */ } /* 169 */ } ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104308330 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- Curious about the performance improvement on Sort. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104285537 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- Thank you for your comment. I overlooked Sort. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user hvanhovell commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104165504 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,21 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * For optimization to suppress shouldStop() in a loop of WholeStageCodegen. + * Returning true means we need to insert shouldStop() into the loop producing rows, if any. + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + /** + * Set to false if this plan consumes all rows produced by children but doesn't output row + * to buffer by calling append(), so the children don't require shouldStop() + * in the loop of producing rows. + */ + protected def shouldStopRequired: Boolean = true --- End diff -- Should this be set to true for all blocking operators? Sort for instance? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104091814 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) --- End diff -- Thank you for your suggestion. However, it caused an assertion failure at `"SPARK-7150 range api"` in DataFrameRangeSuite. In the failure case, `isShouldStopRequired` is called in the class hierarchy by `parent`. ` RangeExec -> FilterExec -> WholeStageCodegenExec` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104090569 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** --- End diff -- Thanks, done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104090563 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + // set false if doConsume() does not insert append() that requires shouldStop() in the loop --- End diff -- Look good, done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104078113 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** --- End diff -- Suggestion for the comment: /** * For optimization to suppress `shouldStop()` in a loop of WholeStageCodegen. * Returning true means we need to insert `shouldStop()` into the loop producing rows, if any. */ --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104077831 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) + } + + // set false if doConsume() does not insert append() that requires shouldStop() in the loop --- End diff -- Suggestion for the comment: /** * Set to false if this plan consumes all rows produced by children but doesn't output row to buffer * by calling `append()`, so the children don't require `shouldStop()` in the loop of producing rows. */ --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r104076198 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +return shouldStopRequired && !(this.parent != null && !this.parent.isShouldStopRequired) --- End diff -- Is this better to understand? def isShouldStopRequired: Boolean = { assert(this.parent != null) shouldStopRequired && this.parent.isShouldStopRequired } --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103978915 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired) + } + + // set true if doConsume() inserts append() method that requires shouldStop() in the loop + protected var shouldStopRequired: Boolean = false --- End diff -- I updated to use an immutable variable. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103927340 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired) + } + + // set true if doConsume() inserts append() method that requires shouldStop() in the loop + protected var shouldStopRequired: Boolean = false --- End diff -- I see. I will rewrite this using immutable var. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103926506 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired) + } + + // set true if doConsume() inserts append() method that requires shouldStop() in the loop + protected var shouldStopRequired: Boolean = false --- End diff -- I would +1 for this. That is part of reason why I said it complicates the logic in previous comment. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user hvanhovell commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103925096 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired) + } + + // set true if doConsume() inserts append() method that requires shouldStop() in the loop + protected var shouldStopRequired: Boolean = false --- End diff -- We spend quite a bit of time debugging issues caused by poorly managed mutable vars in code generation. So I'd rather avoid it. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103923150 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired) + } + + // set true if doConsume() inserts append() method that requires shouldStop() in the loop + protected var shouldStopRequired: Boolean = false --- End diff -- @hvanhovell We can do it technically. It looks simple. On the other hand, we have to maintain an immutable var carefully by investigating whether `append()` is required. In particular, we would add a new CodegenSupport-related class. In contrast, the current approach is easy to maintain the var. When we would add a new CodegenSupport-related class, it is unnecessary to carefully investigate it. This is a trade-off between simplicity and maintainability. What do you think? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user hvanhovell commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103878480 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,18 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /** + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + * + * isShouldStopRequired: require to insert shouldStop() into the loop if true + */ + def isShouldStopRequired: Boolean = { +shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired) + } + + // set true if doConsume() inserts append() method that requires shouldStop() in the loop + protected var shouldStopRequired: Boolean = false --- End diff -- Do you think it is possible to do this without a mutable var? Code generation has way to much mutable state as it is. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103865206 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop + protected var shouldStopRequired: Boolean = false + --- End diff -- ditto --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103865202 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop --- End diff -- Updated comments around here --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103863428 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop + protected var shouldStopRequired: Boolean = false + --- End diff -- Please add a simple comment. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103863351 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop --- End diff -- Btw, the usual style is: /** * * */ --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862946 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop --- End diff -- Your comment style looks weird. Please put `true...` in the /*... */ --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862749 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop --- End diff -- ? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862423 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +206,16 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* + * for optimization to suppress shouldStop() in a loop of WholeStageCodegen + */ + // true: require to insert shouldStop() into a loop --- End diff -- ?? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862311 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala --- @@ -434,6 +434,17 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val input = ctx.freshName("input") // Right now, Range is only used when there is one upstream. ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + +val localIdx = ctx.freshName("localIdx") +val localEnd = ctx.freshName("localEnd") +val range = ctx.freshName("range") +// we need to place consume() before calling isShouldStopRequired --- End diff -- Thank you, done. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862282 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +207,13 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* for optimization */ + var shouldStopRequired: Boolean = false --- End diff -- Sure, done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862257 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -77,6 +77,7 @@ trait CodegenSupport extends SparkPlan { */ final def produce(ctx: CodegenContext, parent: CodegenSupport): String = executeQuery { this.parent = parent + --- End diff -- good catch. done. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103862272 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +207,13 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* for optimization */ --- End diff -- I see. done. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103861360 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala --- @@ -434,6 +434,17 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val input = ctx.freshName("input") // Right now, Range is only used when there is one upstream. ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + +val localIdx = ctx.freshName("localIdx") +val localEnd = ctx.freshName("localEnd") +val range = ctx.freshName("range") +// we need to place consume() before calling isShouldStopRequired --- End diff -- Better to describe the reason that consume() may modify `shouldStopRequired`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103861241 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +207,13 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* for optimization */ + var shouldStopRequired: Boolean = false --- End diff -- Please add `protected`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103860895 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -77,6 +77,7 @@ trait CodegenSupport extends SparkPlan { */ final def produce(ctx: CodegenContext, parent: CodegenSupport): String = executeQuery { this.parent = parent + --- End diff -- extra space. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103860938 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +207,13 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* for optimization */ --- End diff -- Deserve better comment. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103858494 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -77,6 +77,10 @@ trait CodegenSupport extends SparkPlan { */ final def produce(ctx: CodegenContext, parent: CodegenSupport): String = executeQuery { this.parent = parent + +// to track the existence of apply() call in the current produce-consume cycle +// if apply is not called (e.g. in aggregation), we can skip shoudStop in the inner-most loop +parent.shouldStopRequired = false --- End diff -- I wanted to ensure `produce()` starts with `parent.shouldStopRequired = false`. This is because I am afraid other produce-consume may set true into `shouldStopRequired` if we have more than one-produce-consume in one parent. However, in most of cases, it would not happen. For the simplicity, I eliminated this. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103858474 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala --- @@ -69,6 +69,7 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport { override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { val stopEarly = ctx.freshName("stopEarly") ctx.addMutableState("boolean", stopEarly, s"$stopEarly = false;") +shouldStopRequired = true // loop may break early even without append in loop body --- End diff -- Good catch. This implementation depends on slightly old revision that means there is no `stopEarly()` method. Removed this line. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103857751 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala --- @@ -434,6 +434,17 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val input = ctx.freshName("input") // Right now, Range is only used when there is one upstream. ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + +val localIdx = ctx.freshName("localIdx") +val localEnd = ctx.freshName("localEnd") +val range = ctx.freshName("range") +// we need to place consume() before calling isShouldStopRequired +val body = consume(ctx, Seq(ev)) +val shouldStop = if (isShouldStopRequired) { --- End diff -- I think that `isShouldStopRequired` is simple logic. It just checks whether `shouldStopRequired` or parents `shouldStopRequired` is true There are two reasons why `isShouldStopRequired` is necessary. 1. The improvement is largely degraded from 7.6x to 5.5x without `isShouldStopRequired` 2. We may miss some opportunities to enable compiler optimizations since the size of loop body would be increased without `isShouldStopRequired`. This is because a JIT compiler has a threshold of loop body size to apply some loop optimizations such as loop unrolling. ``` OpenJDK 64-Bit Server VM 1.8.0_111-8u111-b14-2ubuntu0.16.04.2-b14 on Linux 4.4.0-47-generic Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz cnt: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative cnt247 / 289 4340.6 0.2 1.0X ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103841503 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala --- @@ -69,6 +69,7 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport { override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { val stopEarly = ctx.freshName("stopEarly") ctx.addMutableState("boolean", stopEarly, s"$stopEarly = false;") +shouldStopRequired = true // loop may break early even without append in loop body --- End diff -- If this `Limit` is the parent of an `Aggregate`, the final `shouldStopRequired` is true. But actually we can skip the check. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103840075 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala --- @@ -442,11 +453,15 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | } | | while (true) { - | while ($number != $batchEnd) { - | long $value = $number; - | $number += ${step}L; - | ${consume(ctx, Seq(ev))} - | if (shouldStop()) return; + | long $range = $batchEnd - $number; + | if ($range != 0L) { + | int $localEnd = (int)($range / ${step}L); + | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { + | long $value = ((long)$localIdx * ${step}L) + $number; + | $body + | $shouldStop --- End diff -- oh. nvm. the outer-most `WholeStageCodegenExec`'s `shouldStopRequired` is true. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103839637 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala --- @@ -442,11 +453,15 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | } | | while (true) { - | while ($number != $batchEnd) { - | long $value = $number; - | $number += ${step}L; - | ${consume(ctx, Seq(ev))} - | if (shouldStop()) return; + | long $range = $batchEnd - $number; + | if ($range != 0L) { + | int $localEnd = (int)($range / ${step}L); + | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { + | long $value = ((long)$localIdx * ${step}L) + $number; + | $body + | $shouldStop --- End diff -- I think under most of cases, we need `shouldStop` check. But currently `shouldStopRequired` is false by default, so you will consume many additional rows now. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103838737 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -77,6 +77,10 @@ trait CodegenSupport extends SparkPlan { */ final def produce(ctx: CodegenContext, parent: CodegenSupport): String = executeQuery { this.parent = parent + +// to track the existence of apply() call in the current produce-consume cycle +// if apply is not called (e.g. in aggregation), we can skip shoudStop in the inner-most loop +parent.shouldStopRequired = false --- End diff -- Do we need this? The default value of `shouldStopRequired` is already false. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103837174 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala --- @@ -434,6 +434,17 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val input = ctx.freshName("input") // Right now, Range is only used when there is one upstream. ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + +val localIdx = ctx.freshName("localIdx") +val localEnd = ctx.freshName("localEnd") +val range = ctx.freshName("range") +// we need to place consume() before calling isShouldStopRequired +val body = consume(ctx, Seq(ev)) +val shouldStop = if (isShouldStopRequired) { --- End diff -- `isShouldStopRequired` complicates the logic. Is it necessary? How much improvement it brings? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user kiszk commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103756297 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +210,15 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* for optimization */ + var shouldStopRequired: Boolean = false + + def isShouldStopRequired: Boolean = { +if (shouldStopRequired) return true --- End diff -- Thank you, it looks better. Done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
Github user srowen commented on a diff in the pull request: https://github.com/apache/spark/pull/17122#discussion_r103731873 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala --- @@ -206,6 +210,15 @@ trait CodegenSupport extends SparkPlan { def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { throw new UnsupportedOperationException } + + /* for optimization */ + var shouldStopRequired: Boolean = false + + def isShouldStopRequired: Boolean = { +if (shouldStopRequired) return true --- End diff -- Can you just write `shouldStopRequired || (this.parent != null && this.parent.isShouldStopRequired)`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #17122: [SPARK-19786][SQL] Facilitate loop optimizations ...
GitHub user kiszk opened a pull request: https://github.com/apache/spark/pull/17122 [SPARK-19786][SQL] Facilitate loop optimizations in a JIT compiler regarding range() ## What changes were proposed in this pull request? This PR improves performance of operations with `range()` by changing Java code generated by Catalyst. This PR is inspired by the [blog article](https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html). This PR changes generated code in the following two points. 1. Replace a while-loop with long instance variables a for-loop with int local varibles 2. Suppress generation of `shouldStop()` method if this method is unnecessary (e.g. `append()` is not generated). These points facilitates compiler optimizations in a JIT compiler by feeding the simplified Java code into the JIT compiler. The performance is improved by 7.6x. Benchmark program: ```java val N = 1 << 29 val iters = 2 val benchmark = new Benchmark("range.count", N * iters) benchmark.addCase(s"with this PR") { i => var n = 0 var len = 0 while (n < iters) { len += sparkSession.range(N).selectExpr("count(id)").collect.length n += 1 } } benchmark.run ``` Performance result without this PR ``` OpenJDK 64-Bit Server VM 1.8.0_111-8u111-b14-2ubuntu0.16.04.2-b14 on Linux 4.4.0-47-generic Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz range.count: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative w/o this PR 1349 / 1356796.2 1.3 1.0X ``` Performance result with this PR ``` OpenJDK 64-Bit Server VM 1.8.0_111-8u111-b14-2ubuntu0.16.04.2-b14 on Linux 4.4.0-47-generic Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz range.count: Best/Avg Time(ms)Rate(M/s) Per Row(ns) Relative with this PR 177 / 271 6065.3 0.2 1.0X ``` Here is a comparison between generated code w/o and with this PR. Only the method ```agg_doAggregateWithoutKey``` is changed. Generated code without this PR ```java /* 005 */ final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator { /* 006 */ private Object[] references; /* 007 */ private scala.collection.Iterator[] inputs; /* 008 */ private boolean agg_initAgg; /* 009 */ private boolean agg_bufIsNull; /* 010 */ private long agg_bufValue; /* 011 */ private org.apache.spark.sql.execution.metric.SQLMetric range_numOutputRows; /* 012 */ private org.apache.spark.sql.execution.metric.SQLMetric range_numGeneratedRows; /* 013 */ private boolean range_initRange; /* 014 */ private long range_number; /* 015 */ private TaskContext range_taskContext; /* 016 */ private InputMetrics range_inputMetrics; /* 017 */ private long range_batchEnd; /* 018 */ private long range_numElementsTodo; /* 019 */ private scala.collection.Iterator range_input; /* 020 */ private UnsafeRow range_result; /* 021 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder range_holder; /* 022 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter range_rowWriter; /* 023 */ private org.apache.spark.sql.execution.metric.SQLMetric agg_numOutputRows; /* 024 */ private org.apache.spark.sql.execution.metric.SQLMetric agg_aggTime; /* 025 */ private UnsafeRow agg_result; /* 026 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder agg_holder; /* 027 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter agg_rowWriter; /* 028 */ /* 029 */ public GeneratedIterator(Object[] references) { /* 030 */ this.references = references; /* 031 */ } /* 032 */ /* 033 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 034 */ partitionIndex = index; /* 035 */ this.inputs = inputs; /* 036 */ agg_initAgg = false; /* 037 */ /* 038 */ this.range_numOutputRows = (org.apache.spark.sql.execution.metric.SQLMetric) references[0]; /* 039 */ this.range_numGeneratedRows = (org.apache.spark.sql.execution.metric.SQLMetric) references[1]; /* 040 */ range_initRange = false; /* 041 */ range_number = 0L; /* 042 */ range_taskContext = TaskContext.get(); /* 043 */ range_inputMetrics = range_taskContext.taskMetrics().inputMetrics(); /* 044 */