bernardodemarco commented on code in PR #9102: URL: https://github.com/apache/cloudstack/pull/9102#discussion_r1876689595
########## plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java: ########## @@ -846,6 +1005,37 @@ private void validateManagedKubernetesClusterCreateParameters(final CreateKubern } } + protected void validateServiceOfferingsForNodeTypes(Map<String, Long> map, + Long defaultServiceOfferingId, + Long etcdNodes, + KubernetesSupportedVersion clusterKubernetesVersion) { + for (String key : CLUSTER_NODES_TYPES_LIST) { + validateServiceOfferingForNode(map, defaultServiceOfferingId, key, etcdNodes, clusterKubernetesVersion); + } + } Review Comment: When creating a k8s cluster without specifying any compute offerings, for example: ```bash create kubernetescluster name="k8s-cluster" zoneid="<zone-id>" kubernetesversionid="<k8s-version-id>" size="1" noderootdisksize="8" ``` The following error is returned: ```bash 🙈 Error: (HTTP 530, error code 9999) No service offering found with ID: null ``` What do you think about validating this scenario when creating the cluster? ```suggestion protected void validateServiceOfferingsForNodeTypes(Map<String, Long> map, Long defaultServiceOfferingId, Long etcdNodes, KubernetesSupportedVersion clusterKubernetesVersion) { if (defaultServiceOfferingId == null && isAnyNodeOfferingEmpty(map)) { throw new InvalidParameterValueException("When serviceofferingid is not specified, service offerings for each node type must be specified in the nodeofferings parameter"); } for (String key : CLUSTER_NODES_TYPES_LIST) { validateServiceOfferingForNode(map, defaultServiceOfferingId, key, clusterKubernetesVersion); } } ``` ########## plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterResourceModifierActionWorker.java: ########## @@ -795,8 +782,27 @@ protected String getKubernetesClusterNodeNamePrefix() { return prefix; } + protected String getEtcdNodeNameForCluster() { + String prefix = kubernetesCluster.getName(); + if (!NetUtils.verifyDomainNameLabel(prefix, true)) { + prefix = prefix.replaceAll("[^a-zA-Z0-9-]", ""); + if (prefix.isEmpty()) { + prefix = kubernetesCluster.getUuid(); + } + } + prefix = prefix + "-etcd" ; + if (prefix.length() > 40) { + prefix = prefix.substring(0, 40); + } + return prefix; + } + protected KubernetesClusterVO updateKubernetesClusterEntry(final Long cores, final Long memory, final Long size, - final Long serviceOfferingId, final Boolean autoscaleEnabled, final Long minSize, final Long maxSize) { + final Long serviceOfferingId, final Boolean autoscaleEnabled, + final Long minSize, final Long maxSize, + final KubernetesClusterNodeType nodeType, + final boolean updateNodeOffering, + final boolean updateClusterOffering) { return Transaction.execute((TransactionCallback<KubernetesClusterVO>) status -> { KubernetesClusterVO updatedCluster = kubernetesClusterDao.createForUpdate(kubernetesCluster.getId()); Review Comment: The `updateKubernetesClusterEntry` method is calling `kubernetesClusterDao.createForUpdate` to update the k8s cluster VO. Apparently, this DAO's method returns the VO with its attributes set to `null`. The VO is then updated, persisted and returned. Eventually, this returned VO will be assigned to the `com.cloud.kubernetes.cluster.actionworkers.KubernetesClusterActionWorker#kubernetesCluster` attribute. Previously, this flow wasn't causing any issues, since it was only executed once per cluster scaling. However, currently, it's executed for each cluster node type. Therefore, in the loop's second iteration, `com.cloud.kubernetes.cluster.actionworkers.KubernetesClusterActionWorker#kubernetesCluster` will contain some `null` attributes and will break the scaling flow. One possible workaround for this scenario would be to, in the `updateKubernetesClusterEntry` method, return the `kubernetesClusterDao.findById(kubernetesCluster.getId())` after persisting the modifications performed on `updatedCluster`. ########## plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java: ########## @@ -1041,37 +1234,53 @@ private void validateKubernetesClusterScaleParameters(ScaleKubernetesClusterCmd } } - ServiceOffering serviceOffering = null; - if (serviceOfferingId != null) { - serviceOffering = serviceOfferingDao.findById(serviceOfferingId); - if (serviceOffering == null) { - throw new InvalidParameterValueException("Failed to find service offering ID: " + serviceOfferingId); - } else { - if (serviceOffering.isDynamic()) { - throw new InvalidParameterValueException(String.format("Custom service offerings are not supported for Kubernetes clusters. Kubernetes cluster : %s, service offering : %s", kubernetesCluster.getName(), serviceOffering.getName())); - } - if (serviceOffering.getCpu() < MIN_KUBERNETES_CLUSTER_NODE_CPU || serviceOffering.getRamSize() < MIN_KUBERNETES_CLUSTER_NODE_RAM_SIZE) { - throw new InvalidParameterValueException(String.format("Kubernetes cluster : %s cannot be scaled with service offering : %s, Kubernetes cluster template(CoreOS) needs minimum %d vCPUs and %d MB RAM", - kubernetesCluster.getName(), serviceOffering.getName(), MIN_KUBERNETES_CLUSTER_NODE_CPU, MIN_KUBERNETES_CLUSTER_NODE_RAM_SIZE)); - } - if (serviceOffering.getCpu() < clusterVersion.getMinimumCpu()) { - throw new InvalidParameterValueException(String.format("Kubernetes cluster : %s cannot be scaled with service offering : %s, associated Kubernetes version : %s needs minimum %d vCPUs", - kubernetesCluster.getName(), serviceOffering.getName(), clusterVersion.getName(), clusterVersion.getMinimumCpu())); + validateServiceOfferingsForNodeTypesScale(serviceOfferingNodeTypeMap, defaultServiceOfferingId, kubernetesCluster, clusterVersion); + + validateKubernetesClusterScaleSize(kubernetesCluster, clusterSize, maxClusterSize, zone); + } + + protected void validateServiceOfferingsForNodeTypesScale(Map<String, Long> map, Long defaultServiceOfferingId, KubernetesClusterVO kubernetesCluster, KubernetesSupportedVersion clusterVersion) { + for (String key : CLUSTER_NODES_TYPES_LIST) { + Long serviceOfferingId = map.getOrDefault(key, defaultServiceOfferingId); + if (serviceOfferingId != null) { + ServiceOffering serviceOffering = serviceOfferingDao.findById(serviceOfferingId); + if (serviceOffering == null) { + throw new InvalidParameterValueException("Failed to find service offering ID: " + serviceOfferingId); } - if (serviceOffering.getRamSize() < clusterVersion.getMinimumRamSize()) { - throw new InvalidParameterValueException(String.format("Kubernetes cluster : %s cannot be scaled with service offering : %s, associated Kubernetes version : %s needs minimum %d MB RAM", - kubernetesCluster.getName(), serviceOffering.getName(), clusterVersion.getName(), clusterVersion.getMinimumRamSize())); + checkServiceOfferingForNodesScale(serviceOffering, kubernetesCluster, clusterVersion); + final ServiceOffering existingServiceOffering = serviceOfferingDao.findById(kubernetesCluster.getServiceOfferingId()); + if (KubernetesCluster.State.Running.equals(kubernetesCluster.getState()) && (serviceOffering.getRamSize() < existingServiceOffering.getRamSize() || + serviceOffering.getCpu() * serviceOffering.getSpeed() < existingServiceOffering.getCpu() * existingServiceOffering.getSpeed())) { + logAndThrow(Level.WARN, String.format("Kubernetes cluster cannot be scaled down for service offering. Service offering : %s offers lesser resources as compared to service offering : %s of Kubernetes cluster : %s", + serviceOffering.getName(), existingServiceOffering.getName(), kubernetesCluster.getName())); } } - final ServiceOffering existingServiceOffering = serviceOfferingDao.findById(kubernetesCluster.getServiceOfferingId()); - if (KubernetesCluster.State.Running.equals(kubernetesCluster.getState()) && (serviceOffering.getRamSize() < existingServiceOffering.getRamSize() || - serviceOffering.getCpu() * serviceOffering.getSpeed() < existingServiceOffering.getCpu() * existingServiceOffering.getSpeed())) { - logAndThrow(Level.WARN, String.format("Kubernetes cluster cannot be scaled down for service offering. Service offering : %s offers lesser resources as compared to service offering : %s of Kubernetes cluster : %s", - serviceOffering.getName(), existingServiceOffering.getName(), kubernetesCluster.getName())); - } } + } - validateKubernetesClusterScaleSize(kubernetesCluster, clusterSize, maxClusterSize, zone); + protected void checkServiceOfferingForNodesScale(ServiceOffering serviceOffering, KubernetesClusterVO kubernetesCluster, KubernetesSupportedVersion clusterVersion) { + if (serviceOffering.isDynamic()) { + throw new InvalidParameterValueException(String.format("Custom service offerings are not supported for Kubernetes clusters. Kubernetes cluster : %s, service offering : %s", kubernetesCluster.getName(), serviceOffering.getName())); + } + if (serviceOffering.getCpu() < MIN_KUBERNETES_CLUSTER_NODE_CPU || serviceOffering.getRamSize() < MIN_KUBERNETES_CLUSTER_NODE_RAM_SIZE) { + throw new InvalidParameterValueException(String.format("Kubernetes cluster : %s cannot be scaled with service offering : %s, Kubernetes cluster template(CoreOS) needs minimum %d vCPUs and %d MB RAM", + kubernetesCluster.getName(), serviceOffering.getName(), MIN_KUBERNETES_CLUSTER_NODE_CPU, MIN_KUBERNETES_CLUSTER_NODE_RAM_SIZE)); + } + if (serviceOffering.getCpu() < clusterVersion.getMinimumCpu()) { + throw new InvalidParameterValueException(String.format("Kubernetes cluster : %s cannot be scaled with service offering : %s, associated Kubernetes version : %s needs minimum %d vCPUs", + kubernetesCluster.getName(), serviceOffering.getName(), clusterVersion.getName(), clusterVersion.getMinimumCpu())); + } + if (serviceOffering.getRamSize() < clusterVersion.getMinimumRamSize()) { + throw new InvalidParameterValueException(String.format("Kubernetes cluster : %s cannot be scaled with service offering : %s, associated Kubernetes version : %s needs minimum %d MB RAM", + kubernetesCluster.getName(), serviceOffering.getName(), clusterVersion.getName(), clusterVersion.getMinimumRamSize())); + } + } + + protected boolean isAnyNodeOfferingEmpty(Map<String, Long> map) { + if (MapUtils.isEmpty(map)) { + return false; + } + return map.values().stream().anyMatch(Objects::isNull); } Review Comment: When scaling k8s clusters this method is executed to check whether the service offerings were specified. However, if the service offerings `map` is empty shouldn't the method return `true` here? ```suggestion protected boolean isAnyNodeOfferingEmpty(Map<String, Long> map) { if (MapUtils.isEmpty(map)) { return true; } return map.values().stream().anyMatch(Objects::isNull); } ``` ########## plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterResourceModifierActionWorker.java: ########## @@ -293,13 +259,13 @@ protected DeployDestination plan(final long nodesCount, final DataCenter zone, f throw new InsufficientServerCapacityException(msg, DataCenter.class, zone.getId()); } - protected DeployDestination plan() throws InsufficientServerCapacityException { + protected DeployDestination plan(Long domainId, Long accountId, Hypervisor.HypervisorType hypervisorType) throws InsufficientServerCapacityException { ServiceOffering offering = serviceOfferingDao.findById(kubernetesCluster.getServiceOfferingId()); DataCenter zone = dataCenterDao.findById(kubernetesCluster.getZoneId()); if (logger.isDebugEnabled()) { logger.debug(String.format("Checking deployment destination for Kubernetes cluster : %s in zone : %s", kubernetesCluster.getName(), zone.getName())); } - return plan(kubernetesCluster.getTotalNodeCount(), zone, offering); + return plan(kubernetesCluster.getTotalNodeCount(), zone, offering, domainId, accountId, hypervisorType); Review Comment: When creating a cluster only specifying the compute offerings for the nodes, a NPE is thrown in this flow: ```bash 🙈 Error: (HTTP 530, error code 9999) Cannot invoke "com.cloud.offering.ServiceOffering.getId()" because "this.val$defaultServiceOffering" is null ``` That is happening basically because the `plan` method, that is executed when the cluster is starting, isn't considering the possibility of different service offerings per each k8s plane. So, a possible workaround for this would be to iterate over each node type, retrieve its respective offering VO by calling `com.cloud.kubernetes.cluster.actionworkers.KubernetesClusterActionWorker#getServiceOfferingForNodeTypeOnCluster`, retrieve the cluster's node count and call `plan` specifying those attributes. This flow would be similar to what was implemented in `com.cloud.kubernetes.cluster.KubernetesClusterManagerImpl#getHypervisorTypeAndValidateNodeDeployments`. ########## plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java: ########## @@ -1217,56 +1427,135 @@ public KubernetesCluster createManagedKubernetesCluster(CreateKubernetesClusterC final DataCenter zone = dataCenterDao.findById(cmd.getZoneId()); final long controlNodeCount = cmd.getControlNodes(); final long clusterSize = cmd.getClusterSize(); - final long totalNodeCount = controlNodeCount + clusterSize; - final ServiceOffering serviceOffering = serviceOfferingDao.findById(cmd.getServiceOfferingId()); + final long etcdNodes = cmd.getEtcdNodes(); + final Map<String, Long> nodeTypeCount = Map.of(WORKER.name(), clusterSize, + CONTROL.name(), controlNodeCount, ETCD.name(), etcdNodes); final Account owner = accountService.getActiveAccountById(cmd.getEntityOwnerId()); final KubernetesSupportedVersion clusterKubernetesVersion = kubernetesSupportedVersionDao.findById(cmd.getKubernetesVersionId()); - - DeployDestination deployDestination = null; - try { - deployDestination = plan(totalNodeCount, zone, serviceOffering); - } catch (InsufficientCapacityException e) { - logAndThrow(Level.ERROR, String.format("Creating Kubernetes cluster failed due to insufficient capacity for %d nodes cluster in zone : %s with service offering : %s", totalNodeCount, zone.getName(), serviceOffering.getName())); - } - if (deployDestination == null || deployDestination.getCluster() == null) { - logAndThrow(Level.ERROR, String.format("Creating Kubernetes cluster failed due to error while finding suitable deployment plan for cluster in zone : %s", zone.getName())); + final Hypervisor.HypervisorType hypervisor = cmd.getHypervisorType(); + + Map<String, Long> serviceOfferingNodeTypeMap = cmd.getServiceOfferingNodeTypeMap(); + Long defaultServiceOfferingId = cmd.getServiceOfferingId(); + String accountName = cmd.getAccountName(); + Long domainId = cmd.getDomainId(); + Long accountId = null; + if (Objects.nonNull(accountName) && Objects.nonNull(domainId)) { + Account account = accountDao.findActiveAccount(accountName, domainId); + if (Objects.nonNull(account)) { + accountId = account.getId(); + } } + Hypervisor.HypervisorType hypervisorType = getHypervisorTypeAndValidateNodeDeployments(serviceOfferingNodeTypeMap, defaultServiceOfferingId, nodeTypeCount, zone, domainId, accountId, hypervisor); SecurityGroup securityGroup = null; if (zone.isSecurityGroupEnabled()) { securityGroup = getOrCreateSecurityGroupForAccount(owner); } + Map<String, Long> templateNodeTypeMap = cmd.getTemplateNodeTypeMap(); + final VMTemplateVO finalTemplate = getKubernetesServiceTemplate(zone, hypervisorType, templateNodeTypeMap, DEFAULT); + final VMTemplateVO controlNodeTemplate = getKubernetesServiceTemplate(zone, hypervisorType, templateNodeTypeMap, CONTROL); + final VMTemplateVO workerNodeTemplate = getKubernetesServiceTemplate(zone, hypervisorType, templateNodeTypeMap, WORKER); + final VMTemplateVO etcdNodeTemplate = getKubernetesServiceTemplate(zone, hypervisorType, templateNodeTypeMap, ETCD); final Network defaultNetwork = getKubernetesClusterNetworkIfMissing(cmd.getName(), zone, owner, (int)controlNodeCount, (int)clusterSize, cmd.getExternalLoadBalancerIpAddress(), cmd.getNetworkId()); - final VMTemplateVO finalTemplate = getKubernetesServiceTemplate(zone, deployDestination.getCluster().getHypervisorType()); - final long cores = serviceOffering.getCpu() * (controlNodeCount + clusterSize); - final long memory = serviceOffering.getRamSize() * (controlNodeCount + clusterSize); - final SecurityGroup finalSecurityGroup = securityGroup; final KubernetesClusterVO cluster = Transaction.execute(new TransactionCallback<KubernetesClusterVO>() { @Override public KubernetesClusterVO doInTransaction(TransactionStatus status) { + final ServiceOffering defaultServiceOffering = serviceOfferingDao.findById(defaultServiceOfferingId); + Pair<Long, Long> capacityPair = calculateClusterCapacity(serviceOfferingNodeTypeMap, nodeTypeCount, defaultServiceOfferingId); + final long cores = capacityPair.first(); + final long memory = capacityPair.second(); + KubernetesClusterVO newCluster = new KubernetesClusterVO(cmd.getName(), cmd.getDisplayName(), zone.getId(), clusterKubernetesVersion.getId(), - serviceOffering.getId(), finalTemplate.getId(), defaultNetwork.getId(), owner.getDomainId(), - owner.getAccountId(), controlNodeCount, clusterSize, KubernetesCluster.State.Created, cmd.getSSHKeyPairName(), cores, memory, + defaultServiceOffering.getId(), Objects.nonNull(finalTemplate) ? finalTemplate.getId() : null, Review Comment: When creating a cluster without specifying a default service offering ID, a NPE is thrown here. ```suggestion defaultServiceOfferingId, Objects.nonNull(finalTemplate) ? finalTemplate.getId() : null, ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@cloudstack.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org