zhuqi-lucas commented on code in PR #890:
URL: https://github.com/apache/yunikorn-k8shim/pull/890#discussion_r1717812365
##########
pkg/cache/context.go:
##########
@@ -754,20 +755,73 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
// convert nil to empty array
volumes.DynamicProvisions =
make([]*v1.PersistentVolumeClaim, 0)
}
- err =
ctx.apiProvider.GetAPIs().VolumeBinder.BindPodVolumes(context.Background(),
assumedPod, volumes)
- if err != nil {
- log.Log(log.ShimContext).Error("Failed to bind
pod volumes",
- zap.String("podName", assumedPod.Name),
- zap.String("nodeName",
assumedPod.Spec.NodeName),
- zap.Int("dynamicProvisions",
len(volumes.DynamicProvisions)),
- zap.Int("staticBindings",
len(volumes.StaticBindings)))
- return err
- }
+ // Here we set the max retry count to 5, and use the
default retry strategy
+ // Details:
+ // max sleep time is 1s, 2s, 4s, 8s and the last one
will not sleep
+ return ctx.bindPodVolumesWithRetry(assumedPod, volumes,
5, &DefaultRetryStrategy{})
}
}
return nil
}
+type RetryStrategy interface {
+ // Sleep function used for retry delays
+ Sleep(duration time.Duration)
+}
+
+// DefaultRetryStrategy is a simple retry strategy that sleeps for a fixed
duration
+// We can extend this to support more advanced retry strategies in the future
and also for testing purposes
+type DefaultRetryStrategy struct{}
+
+func (r *DefaultRetryStrategy) Sleep(duration time.Duration) {
+ time.Sleep(duration)
+}
+
+func (ctx *Context) bindPodVolumesWithRetry(
+ assumedPod *v1.Pod,
+ volumes *volumebinding.PodVolumes,
+ maxRetries int,
+ retryStrategy RetryStrategy,
+) error {
+ const baseDelay = time.Second
+ const maxDelay = 8 * time.Second
+
+ var err error
+ for i := 0; i < maxRetries; i++ {
+ err =
ctx.apiProvider.GetAPIs().VolumeBinder.BindPodVolumes(context.Background(),
assumedPod, volumes)
+ if err == nil {
+ return nil
+ }
+
+ log.Log(log.ShimContext).Error("Failed to bind pod volumes",
+ zap.String("podName", assumedPod.Name),
+ zap.String("nodeName", assumedPod.Spec.NodeName),
+ zap.Int("dynamicProvisions",
len(volumes.DynamicProvisions)),
+ zap.Int("staticBindings", len(volumes.StaticBindings)),
+ zap.Int("retryCount", i+1),
+ zap.Error(err))
+
+ if i == maxRetries-1 {
+ log.Log(log.ShimContext).Error("Failed to bind pod
volumes after retry",
+ zap.String("podName", assumedPod.Name),
+ zap.String("nodeName",
assumedPod.Spec.NodeName),
+ zap.Int("dynamicProvisions",
len(volumes.DynamicProvisions)),
+ zap.Int("staticBindings",
len(volumes.StaticBindings)),
+ zap.Error(err))
+ return err
+ }
+
+ delay := baseDelay * time.Duration(1<<uint(i))
+ if delay > maxDelay {
+ delay = maxDelay
+ }
+
+ retryStrategy.Sleep(delay) // Use the retry strategy
+ }
Review Comment:
This is a good wrapper, i will take a look to replace.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]