juliuszsompolski commented on code in PR #41829:
URL: https://github.com/apache/spark/pull/41829#discussion_r1251980200
##########
connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala:
##########
@@ -60,14 +64,42 @@ private[sql] class SparkConnectClient(
new ArtifactManager(userContext, sessionId, channel)
}
+ private val retryPolicy: SparkConnectClient.RetryPolicy =
configuration.retryPolicy
+
+ @tailrec private[client] final def retry[T](fn: => T, currentRetryNum: Int =
0): T = {
+ if (currentRetryNum > retryPolicy.maxRetries) {
+ throw new IllegalArgumentException(
+ s"The number of retries ($currentRetryNum) must not exceed " +
+ s"the maximum number of retries (${retryPolicy.maxRetries}).")
+ }
+ try {
+ return fn
+ } catch {
+ case NonFatal(e) if retryPolicy.canRetry(e) && currentRetryNum <
retryPolicy.maxRetries =>
+ Thread.sleep(
+ (retryPolicy.maxBackoff min retryPolicy.initialBackoff * Math
+ .pow(retryPolicy.backoffMultiplier, currentRetryNum)).toMillis)
+ }
+ retry(fn, currentRetryNum + 1)
+ }
+
/**
* Dispatch the [[proto.AnalyzePlanRequest]] to the Spark Connect server.
* @return
* A [[proto.AnalyzePlanResponse]] from the Spark Connect server.
*/
def analyze(request: proto.AnalyzePlanRequest): proto.AnalyzePlanResponse = {
artifactManager.uploadAllClassFileArtifacts()
- stub.analyzePlan(request)
+ retry {
+ stub.analyzePlan(request)
+ }
+ }
+
+ private class executeRetryIterator(result:
java.util.Iterator[proto.ExecutePlanResponse])
+ extends java.util.Iterator[proto.ExecutePlanResponse] {
+ override def next(): proto.ExecutePlanResponse = retry { result.next() }
+ override def hasNext(): Boolean = retry { result.hasNext() }
+ override def remove(): Unit = retry { result.remove() }
}
Review Comment:
This is quite dangerous... likely also in the existing Python retry.
If the `next()` call has actually reached the server, but then something on
the response path errored and we retry it, the retry here can silently lose
some rows from the middle of the query (e.g. if we lose an ArrowBatch
response; we currently don't track consistency, i.e. whether any row offsets
were skipped).
I think we should not retry errors raised from the iterator like that.
We should only retry if `stub.executePlan(request)` failed to return an
iterator at all.
The same applies to the Python client, if its behavior differs.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]