attilapiros opened a new pull request #33261:
URL: https://github.com/apache/spark/pull/33261


   ### What changes were proposed in this pull request?
   
   Setting `kubernetes.request.retry.backoffLimit` to 3 by default when the 
user hasn't specified any value for it.
   
   This way, when the K8s API server returns an HTTP response code >= 500, an 
exponential backoff is triggered (where 
`kubernetes.request.retry.backoffInterval` is 1000ms by default).
   
   For details please check 
https://github.com/fabric8io/kubernetes-client/issues/3087.
   
   ### Why are the changes needed?
   
   We experienced some internal K8s errors. For example, while an `etcdserver` 
leader election was ongoing, the error was propagated to the API client and 
caused an issue in Spark:
   
   ```
   Caused by: io.fabric8.kubernetes.client.KubernetesClientException: Failure 
executing: GET at:
   
https://kubernetes.default.svc/api/v1/namespaces/dex-app-bl24w4z9/pods/sparkpi-10-fcd3f6781a874212-driver.
 Message: etcdserver: 
   leader changed. Received status: Status(apiVersion=v1, code=500, 
details=null, kind=Status, message=etcdserver: leader changed, 
   metadata=ListMeta(_continue=null, remainingItemCount=null, 
resourceVersion=null, selfLink=null, additionalProperties={}), reason=null, 
   status=Failure, additionalProperties={}).
   ```
   
   ### Does this PR introduce _any_ user-facing change?
   No.
   
   ### How was this patch tested?
   
   Running the integration tests with 
`log4j.logger.org.apache.spark.deploy.k8s.SparkKubernetesClientFactory=DEBUG` 
set in the log4j config produced the following log:
   
   ```
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils: 21/07/08 11:01:14 DEBUG 
org.apache.spark.deploy.k8s.SparkKubernetesClientFactory: Kubernetes client 
config: {
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "requestConfig" : {
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "username" : null,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "password" : null,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "oauthToken" : null,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "oauthTokenProvider" : null,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "impersonateUsername" : null,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "impersonateGroups" : [ null ],
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "impersonateExtras" : { },
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "watchReconnectInterval" : 1000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "watchReconnectLimit" : -1,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "connectionTimeout" : 10000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "uploadConnectionTimeout" : 10000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "uploadRequestTimeout" : 120000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "requestRetryBackoffLimit" : 3,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "requestRetryBackoffInterval" : 1000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "requestTimeout" : 10000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "rollingTimeout" : 900000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "scaleTimeout" : 600000,
   21/07/08 11:01:14.873 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "loggingInterval" : 20000,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "websocketTimeout" : 5000,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "websocketPingInterval" : 0,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "maxConcurrentRequests" : 64,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "maxConcurrentRequestsPerHost" : 5,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "impersonateGroup" : null
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "contexts" : [ {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "context" : {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "cluster" : "talos-default",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "namespace" : "default",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "user" : "admin@talos-default"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "name" : "admin@talos-default"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   }, {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "context" : {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "cluster" : 
"arn:aws:eks:us-west-2:392479084068:cluster/mow",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "user" : "arn:aws:eks:us-west-2:392479084068:cluster/mow"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "name" : "arn:aws:eks:us-west-2:392479084068:cluster/mow"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   }, {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "context" : {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "cluster" : "minikube",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "extensions" : [ {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:         "name" : "context_info"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       } ],
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "namespace" : "default",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "user" : "minikube"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "name" : "minikube"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   }, {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "context" : {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "cluster" : "",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "user" : ""
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "name" : "mow"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   } ],
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "currentContext" : {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "context" : {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "cluster" : "minikube",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "extensions" : [ {
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:         "name" : "context_info"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       } ],
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "namespace" : "default",
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:       "user" : "minikube"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "name" : "minikube"
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   },
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "maxConcurrentRequests" : 64,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "maxConcurrentRequestsPerHost" : 5,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "autoConfigure" : false,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "trustCerts" : false,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "disableHostnameVerification" : false,
   21/07/08 11:01:14.874 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "masterUrl" : "https://192.168.64.127:8443/",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "apiVersion" : "v1",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "namespace" : "a0993113b8084cd3868b3052e698b17f",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "caCertFile" : "/Users/attilazsoltpiros/.minikube/ca.crt",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "clientCertFile" : 
"/Users/attilazsoltpiros/.minikube/profiles/minikube/client.crt",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "clientKeyFile" : 
"/Users/attilazsoltpiros/.minikube/profiles/minikube/client.key",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "clientKeyAlgo" : "RSA",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "clientKeyPassphrase" : "changeit",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "watchReconnectInterval" : 1000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "watchReconnectLimit" : -1,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "connectionTimeout" : 10000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "uploadConnectionTimeout" : 10000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "uploadRequestTimeout" : 120000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "requestRetryBackoffLimit" : 3,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "requestRetryBackoffInterval" : 1000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "requestTimeout" : 10000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "rollingTimeout" : 900000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "scaleTimeout" : 600000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "loggingInterval" : 20000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "websocketTimeout" : 5000,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "websocketPingInterval" : 0,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "impersonateGroups" : [ null ],
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "impersonateExtras" : { },
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "http2Disable" : false,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "noProxy" : [ ],
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "tlsVersions" : [ "TLS_1_2" ],
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "errorMessages" : {
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "401" : "Unauthorized! Token may have expired! Please log-in 
again.",
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:     "403" : "Forbidden! User minikube doesn't have permission."
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   }
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils: }
   ```
   
   This output contains the expected values:
   ```
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "requestRetryBackoffLimit" : 3,
   21/07/08 11:01:14.875 ScalaTest-main-running-KubernetesSuite INFO 
ProcessUtils:   "requestRetryBackoffInterval" : 1000,
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to