This is an automated email from the ASF dual-hosted git repository. alexkun pushed a commit to branch dev-1.2.0 in repository https://gitbox.apache.org/repos/asf/incubator-linkis.git
commit 4b1cb6eb11e0aa1bc311d7931c1d32595752201c Author: peacewong <[email protected]> AuthorDate: Fri Jul 22 19:36:31 2022 +0800 Optimized engine startup to support more Retry scenarios close #2505 --- .../server/service/impl/ProcessEngineConnLaunchService.scala | 4 ++++ .../manager/am/service/engine/DefaultEngineCreateService.scala | 10 ++++++++++ .../manager/rm/service/impl/DefaultResourceManager.scala | 5 +++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/linkis-computation-governance/linkis-engineconn-manager/linkis-engineconn-manager-server/src/main/scala/org/apache/linkis/ecm/server/service/impl/ProcessEngineConnLaunchService.scala b/linkis-computation-governance/linkis-engineconn-manager/linkis-engineconn-manager-server/src/main/scala/org/apache/linkis/ecm/server/service/impl/ProcessEngineConnLaunchService.scala index ca0db8b8f..1f004f777 100644 --- a/linkis-computation-governance/linkis-engineconn-manager/linkis-engineconn-manager-server/src/main/scala/org/apache/linkis/ecm/server/service/impl/ProcessEngineConnLaunchService.scala +++ b/linkis-computation-governance/linkis-engineconn-manager/linkis-engineconn-manager-server/src/main/scala/org/apache/linkis/ecm/server/service/impl/ProcessEngineConnLaunchService.scala @@ -36,6 +36,7 @@ import org.apache.linkis.manager.engineplugin.common.launch.entity.EngineConnLau import org.apache.linkis.rpc.Sender import org.apache.commons.io.IOUtils import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.commons.lang3.StringUtils import org.apache.linkis.ecm.server.service.LocalDirsHandleService import org.apache.linkis.manager.label.utils.LabelUtil @@ -65,6 +66,9 @@ abstract class ProcessEngineConnLaunchService extends AbstractEngineConnLaunchSe case ecmError: ECMErrorException => if (ECMErrorCode.EC_START_TIME_OUT == ecmError.getErrCode) { true + } else if (StringUtils.isBlank(ecmError.getDesc)) { + logger.info("exception desc is null, can be retry") + true } else { false } diff --git a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/am/service/engine/DefaultEngineCreateService.scala b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/am/service/engine/DefaultEngineCreateService.scala index 4dcc236be..733cfb1e5 100644 --- a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/am/service/engine/DefaultEngineCreateService.scala +++ b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/am/service/engine/DefaultEngineCreateService.scala @@ -215,6 +215,16 @@ class DefaultEngineCreateService extends AbstractEngineService with EngineCreate Utils.tryCatch(getEngineNodeManager.updateEngineNode(oldServiceInstance, engineNode)) { t => logger.warn(s"Failed to update engineNode $engineNode", t) + val stopEngineRequest = new EngineStopRequest(engineNode.getServiceInstance, ManagerUtils.getAdminUser) + engineStopService.asyncStopEngine(stopEngineRequest) + val failedEcNode = getEngineNodeManager.getEngineNode(oldServiceInstance) + if (null == failedEcNode) { + logger.info(s" engineConn is not exists in db: $oldServiceInstance ") + } else { + failedEcNode.setLabels(nodeLabelService.getNodeLabels(oldServiceInstance)) + failedEcNode.getLabels.addAll(LabelUtils.distinctLabel(labelFilter.choseEngineLabel(labelList), emNode.getLabels)) + engineStopService.engineConnInfoClear(failedEcNode) + } throw new LinkisRetryException(AMConstant.EM_ERROR_CODE, s"Failed to update engineNode: ${t.getMessage}") } diff --git a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/rm/service/impl/DefaultResourceManager.scala b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/rm/service/impl/DefaultResourceManager.scala index 5b675f508..b98a70446 100644 --- a/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/rm/service/impl/DefaultResourceManager.scala +++ b/linkis-computation-governance/linkis-manager/linkis-application-manager/src/main/scala/org/apache/linkis/manager/rm/service/impl/DefaultResourceManager.scala @@ -19,6 +19,7 @@ package org.apache.linkis.manager.rm.service.impl import com.google.common.collect.Lists import org.apache.linkis.common.ServiceInstance +import org.apache.linkis.common.exception.LinkisRetryException import org.apache.linkis.common.utils.{Logging, Utils} import org.apache.linkis.governance.common.conf.GovernanceCommonConf import org.apache.linkis.manager.common.conf.RMConfiguration @@ -587,10 +588,10 @@ class DefaultResourceManager extends ResourceManager with Logging with Initializ private def tryLock(labelContainer: RMLabelContainer, timeOut: Long = -1): Unit = { labelContainer.getResourceLabels.foreach { case label: Label[_] => - labelContainer.setCurrentLabel(label.asInstanceOf[Label[_]]) + labelContainer.setCurrentLabel(label) val locked = resourceLockService.tryLock(labelContainer, timeOut) if (!locked) { - throw new RMWarnException(110022, s"try to lock resource label ${labelContainer.getCurrentLabel} over $timeOut ms, please wait a moment and try again!") + throw new LinkisRetryException(110022, s"try to lock resource label ${labelContainer.getCurrentLabel} over $timeOut ms, please wait a moment and try again!") } case _ => } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
