pan3793 commented on code in PR #6997: URL: https://github.com/apache/kyuubi/pull/6997#discussion_r2010661031
########## kyuubi-server/src/main/scala/org/apache/kyuubi/operation/BatchJobSubmission.scala: ########## @@ -250,50 +252,58 @@ class BatchJobSubmission( private def submitAndMonitorBatchJob(): Unit = { var appStatusFirstUpdated = false var lastStarvationCheckTime = createTime + + def doUpdateApplicationInfoMetadataIfNeeded(): Unit = { + updateApplicationInfoMetadataIfNeeded() + if (!appStatusFirstUpdated) { + // only the ApplicationInfo with non-empty id indicates that batch is RUNNING + if (applicationId(_applicationInfo).isDefined) { + setStateIfNotCanceled(OperationState.RUNNING) + updateBatchMetadata() + appStatusFirstUpdated = true + } else { + val currentTime = System.currentTimeMillis() + if (currentTime - lastStarvationCheckTime > applicationStarvationTimeout) { + lastStarvationCheckTime = currentTime + warn(s"Batch[$batchId] has not started, check the Kyuubi server to ensure" + + s" that batch jobs can be submitted.") + } + } + } + } + try { info(s"Submitting $batchType batch[$batchId] job:\n$builder") val process = builder.start - while (!applicationFailed(_applicationInfo) && process.isAlive) { - updateApplicationInfoMetadataIfNeeded() - if (!appStatusFirstUpdated) { - // only the ApplicationInfo with non-empty id indicates that batch is RUNNING - if (applicationId(_applicationInfo).isDefined) { - setStateIfNotCanceled(OperationState.RUNNING) - updateBatchMetadata() - appStatusFirstUpdated = true - } else { - val currentTime = System.currentTimeMillis() - if (currentTime - lastStarvationCheckTime > applicationStarvationTimeout) { - lastStarvationCheckTime = currentTime - warn(s"Batch[$batchId] has not started, check the Kyuubi server to ensure" + - s" that batch jobs can be submitted.") - } - } - } + while (process.isAlive && !applicationFailed(_applicationInfo)) { + doUpdateApplicationInfoMetadataIfNeeded() process.waitFor(applicationCheckInterval, TimeUnit.MILLISECONDS) } + if (!process.isAlive) { + doUpdateApplicationInfoMetadataIfNeeded() Review 
Comment: Just a note: this is the key change. In the current round, the app state is `NOT_FOUND` because the submit stage exceeds `kyuubi.engine.yarn.submit.timeout`. Then, during the `process.waitFor` period, the submission succeeds and `process.isAlive` returns false, so the loop exits; without this extra update there is no chance to retrieve the app state from the cluster manager -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@kyuubi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: notifications-unsubscr...@kyuubi.apache.org For additional commands, e-mail: notifications-h...@kyuubi.apache.org