yangwwei commented on a change in pull request #359:
URL: https://github.com/apache/incubator-yunikorn-k8shim/pull/359#discussion_r791274687



##########
File path: pkg/cache/context.go
##########
@@ -365,19 +366,68 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
                                zap.String("podName", pod.Name))
                } else {
                        log.Logger().Info("Binding Pod Volumes", zap.String("podName", pod.Name))
-                       boundClaims, claimsToBind, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
+                       boundClaims, claimsToBind, unboundClaimsImmediate, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
                        if err != nil {
+                               log.Logger().Error("Failed to get pod volumes",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.Error(err))
+                               return err
+                       }
+                       if len(unboundClaimsImmediate) > 0 {
+                               err = fmt.Errorf("pod %s has unbound immediate claims", pod.Name)
+                               log.Logger().Error("Pod has unbound immediate claims",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.Error(err))
                                return err
                        }
                        node, err := ctx.schedulerCache.GetNodeInfo(assumedPod.Spec.NodeName)
                        if err != nil {
+                               log.Logger().Error("Failed to get node info",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.String("nodeName", assumedPod.Spec.NodeName),
+                                       zap.Error(err))
+                               return err
+                       }
+                       volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+                       if err != nil {
+                               log.Logger().Error("Failed to find pod volumes",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.String("nodeName", assumedPod.Spec.NodeName),
+                                       zap.Int("claimsToBind", len(claimsToBind)),
+                                       zap.Error(err))
                                return err
                        }
-                       volumes, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+                       if len(reasons) > 0 {
+                               sReasons := make([]string, 0)
+                               for _, reason := range reasons {
+                                       sReasons = append(sReasons, string(reason))
+                               }
+                               sReason := strings.Join(sReasons, ", ")
+                               err = fmt.Errorf("pod %s has conflicting volume claims: %s", pod.Name, sReason)
+                               log.Logger().Error("Pod has conflicting volume claims",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.String("nodeName", assumedPod.Spec.NodeName),
+                                       zap.Int("claimsToBind", len(claimsToBind)),
+                                       zap.Error(err))
+                               return err
+                       }

Review comment:
       Good catch.
   
   ```go
   volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
   ```
   It seems there is a case where `err == nil` but `len(reasons) > 0`:
   
   > // It returns an error when something went wrong or a list of reasons why the node is
   > // (currently) not usable for the pod.
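   To make the two failure modes concrete, here is a minimal sketch of the calling pattern (names as in the diff; the error message is illustrative, not the PR's actual text):
   
   ```go
   volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
   if err != nil {
       // something actually went wrong while evaluating the volumes
       return err
   }
   if len(reasons) > 0 {
       // err == nil, yet the node is (currently) not usable for the pod;
       // this must still fail the bind, as the new code above now does
       return fmt.Errorf("node %s not usable for pod %s: %v",
           assumedPod.Spec.NodeName, assumedPod.Name, reasons)
   }
   // only now is it safe to proceed with `volumes`
   ```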

##########
File path: pkg/cache/context.go
##########
@@ -365,19 +366,68 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
                                zap.String("podName", pod.Name))
                } else {
                        log.Logger().Info("Binding Pod Volumes", zap.String("podName", pod.Name))
-                       boundClaims, claimsToBind, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
+                       boundClaims, claimsToBind, unboundClaimsImmediate, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
                        if err != nil {
+                               log.Logger().Error("Failed to get pod volumes",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.Error(err))
+                               return err
+                       }
+                       if len(unboundClaimsImmediate) > 0 {
+                               err = fmt.Errorf("pod %s has unbound immediate claims", pod.Name)
+                               log.Logger().Error("Pod has unbound immediate claims",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.Error(err))
                                return err
                        }
                        node, err := ctx.schedulerCache.GetNodeInfo(assumedPod.Spec.NodeName)
                        if err != nil {
+                               log.Logger().Error("Failed to get node info",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.String("nodeName", assumedPod.Spec.NodeName),
+                                       zap.Error(err))
+                               return err
+                       }
+                       volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+                       if err != nil {
+                               log.Logger().Error("Failed to find pod volumes",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.String("nodeName", assumedPod.Spec.NodeName),
+                                       zap.Int("claimsToBind", len(claimsToBind)),
+                                       zap.Error(err))
                                return err
                        }
-                       volumes, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+                       if len(reasons) > 0 {
+                               sReasons := make([]string, 0)
+                               for _, reason := range reasons {
+                                       sReasons = append(sReasons, string(reason))
+                               }
+                               sReason := strings.Join(sReasons, ", ")
+                               err = fmt.Errorf("pod %s has conflicting volume claims: %s", pod.Name, sReason)
+                               log.Logger().Error("Pod has conflicting volume claims",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.String("nodeName", assumedPod.Spec.NodeName),
+                                       zap.Int("claimsToBind", len(claimsToBind)),
+                                       zap.Error(err))
+                               return err
+                       }
+                       if volumes.StaticBindings == nil {
+                               // convert nil to empty array
+                               volumes.StaticBindings = make([]*scheduling.BindingInfo, 0)
+                       }
+                       if volumes.DynamicProvisions == nil {
+                               // convert nil to empty array
+                               volumes.DynamicProvisions = make([]*v1.PersistentVolumeClaim, 0)
+                       }

Review comment:
       Why does this fix the problem?
   I did not see anything that complains about a nil value.
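   For what it's worth, one plausible reason (my assumption, not something stated in this PR): Go treats a nil slice and an empty slice the same for `len`, `range`, and `append`, but not for JSON serialization or `reflect.DeepEqual`, so a downstream consumer that serializes or deep-compares `volumes` could behave differently. A self-contained sketch of the distinction:
   
   ```go
   package main
   
   import (
       "encoding/json"
       "fmt"
       "reflect"
   )
   
   func main() {
       var nilSlice []string           // nil slice
       emptySlice := make([]string, 0) // non-nil, zero-length slice
   
       // identical for len (and likewise for range/append):
       fmt.Println(len(nilSlice), len(emptySlice)) // 0 0
   
       // different for serialization and deep equality:
       a, _ := json.Marshal(nilSlice)                       // null
       b, _ := json.Marshal(emptySlice)                     // []
       fmt.Println(string(a), string(b))                    // null []
       fmt.Println(reflect.DeepEqual(nilSlice, emptySlice)) // false
   }
   ```
   If neither of those applies on the consumer side, the conversion may indeed be unnecessary.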

##########
File path: pkg/cache/context.go
##########
@@ -365,19 +366,68 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
                                zap.String("podName", pod.Name))
                } else {
                        log.Logger().Info("Binding Pod Volumes", zap.String("podName", pod.Name))
-                       boundClaims, claimsToBind, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
+                       boundClaims, claimsToBind, unboundClaimsImmediate, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
                        if err != nil {
+                               log.Logger().Error("Failed to get pod volumes",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.Error(err))
+                               return err
+                       }
+                       if len(unboundClaimsImmediate) > 0 {
+                               err = fmt.Errorf("pod %s has unbound immediate claims", pod.Name)
+                               log.Logger().Error("Pod has unbound immediate claims",
+                                       zap.String("podName", assumedPod.Name),
+                                       zap.Error(err))

Review comment:
       Is this the right place to check for unbound immediate claims? It may be too late. A PVC with immediate binding relies on the PV controller to do the binding; before that happens, the default scheduler [skips scheduling the pod in the PreFilter phase](https://github.com/kubernetes/kubernetes/blob/c592bd40f2df941aa4ea364592ce92fd5c669bfc/pkg/scheduler/framework/plugins/volumebinding/volume_binding.go#L185-L187). Our code here runs quite late: returning an error means we failed to schedule this pod, and the user will see a "PodVolumesBindFailure" error on the pod. Would it make sense to do this check [here](https://github.com/apache/incubator-yunikorn-k8shim/blob/799a42ed9b247815fd344f70e0df27d6b83f90d2/pkg/cache/application.go#L414)? That is a sanity check to ensure a pod is ready for scheduling.
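   To illustrate the suggestion, a hedged sketch of such an early sanity check (`podVolumesGetter` and `checkPodVolumesSchedulable` are hypothetical names; the four-value `GetPodVolumes` shape is taken from the diff above):
   
   ```go
   import (
       "fmt"
   
       v1 "k8s.io/api/core/v1"
   )
   
   // podVolumesGetter captures just the method this check needs; the
   // VolumeBinder used in the diff above satisfies it.
   type podVolumesGetter interface {
       GetPodVolumes(pod *v1.Pod) (boundClaims, unboundClaimsDelayBinding,
           unboundClaimsImmediate []*v1.PersistentVolumeClaim, err error)
   }
   
   // checkPodVolumesSchedulable fails fast on unbound immediate claims,
   // mirroring the default scheduler's PreFilter behaviour, instead of
   // surfacing a PodVolumesBindFailure at bind time.
   func checkPodVolumesSchedulable(binder podVolumesGetter, pod *v1.Pod) error {
       _, _, unboundClaimsImmediate, err := binder.GetPodVolumes(pod)
       if err != nil {
           return err
       }
       if len(unboundClaimsImmediate) > 0 {
           return fmt.Errorf("pod %s/%s has unbound immediate PersistentVolumeClaims",
               pod.Namespace, pod.Name)
       }
       return nil
   }
   ```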




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@yunikorn.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

