style95 commented on a change in pull request #5102: URL: https://github.com/apache/openwhisk/pull/5102#discussion_r627261946
########## File path: core/invoker/src/main/resources/application.conf ########## @@ -60,9 +60,13 @@ whisk { user-memory: 1024 m concurrent-peek-factor: 0.5 #factor used to limit message peeking: 0 < factor <= 1.0 - larger number improves concurrent processing, but increases risk of message loss during invoker crash akka-client: false # if true, use PoolingContainerClient for HTTP from invoker to action container (otherwise use ApacheBlockingContainerClient) - prewarm-expiration-check-interval: 1 minute # period to check for prewarm expiration + prewarm-expiration-check-init-delay: 10 minute # the init delay time for the first check + prewarm-expiration-check-interval: 10 minute # period to check for prewarm expiration prewarm-expiration-check-interval-variance: 10 seconds # varies expiration across invokers to avoid many concurrent expirations prewarm-expiration-limit: 100 # number of prewarms to expire in one expiration cycle (remaining expired will be considered for expiration in next cycle) + prewarm-max-retry-limit: 5 # max retry limit for create prewarm + prewarm-promotion: false # if true, action can take prewarm container which has bigger memory Review comment: Are the following two configurations used? ########## File path: core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/FunctionPullingContainerPool.scala ########## @@ -0,0 +1,866 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.openwhisk.core.containerpool.v2 + +import akka.actor.{Actor, ActorRef, ActorRefFactory, Cancellable, Props} +import org.apache.kafka.clients.producer.RecordMetadata +import org.apache.openwhisk.common._ +import org.apache.openwhisk.core.connector.ContainerCreationError._ +import org.apache.openwhisk.core.connector.{ + ContainerCreationAckMessage, + ContainerCreationMessage, + ContainerDeletionMessage +} +import org.apache.openwhisk.core.containerpool.{ + AdjustPrewarmedContainer, + BlackboxStartupError, + ColdStartKey, + ContainerPool, + ContainerPoolConfig, + ContainerRemoved, + PrewarmingConfig, + WhiskContainerStartupError +} +import org.apache.openwhisk.core.entity._ +import org.apache.openwhisk.core.entity.size._ +import org.apache.openwhisk.http.Messages + +import scala.annotation.tailrec +import scala.collection.concurrent.TrieMap +import scala.collection.immutable +import scala.concurrent.Future +import scala.concurrent.duration._ +import scala.util.{Random, Try} +import scala.collection.immutable.Queue + +case class Creation(creationMessage: ContainerCreationMessage, action: WhiskAction) +case class Deletion(deletionMessage: ContainerDeletionMessage) +case object Remove +case class Keep(timeout: FiniteDuration) +case class PrewarmContainer(maxConcurrent: Int) + +/** + * A pool managing containers to run actions on. + * + * This pool fulfills the other half of the ContainerProxy contract. Only + * one job (either Start or Run) is sent to a child-actor at any given + * time. 
The pool then waits for a response of that container, indicating + * the container is done with the job. Only then will the pool send another + * request to that container. + * + * Upon actor creation, the pool will start to prewarm containers according + * to the provided prewarmConfig, iff set. Those containers will **not** be + * part of the poolsize calculation, which is capped by the poolSize parameter. + * Prewarm containers are only used, if they have matching arguments + * (kind, memory) and there is space in the pool. + * + * @param childFactory method to create new container proxy actor + * @param prewarmConfig optional settings for container prewarming + * @param poolConfig config for the ContainerPool + */ +class FunctionPullingContainerPool( + childFactory: ActorRefFactory => ActorRef, + invokerHealthService: ActorRef, + poolConfig: ContainerPoolConfig, + instance: InvokerInstanceId, + prewarmConfig: List[PrewarmingConfig] = List.empty, + sendAckToScheduler: (SchedulerInstanceId, ContainerCreationAckMessage) => Future[RecordMetadata])( + implicit val logging: Logging) + extends Actor { + import ContainerPoolV2.memoryConsumptionOf + + implicit val ec = context.system.dispatcher + + private var busyPool = immutable.Map.empty[ActorRef, Data] + private var inProgressPool = immutable.Map.empty[ActorRef, Data] + private var warmedPool = immutable.Map.empty[ActorRef, WarmData] + private var prewarmedPool = immutable.Map.empty[ActorRef, PreWarmData] + private var prewarmStartingPool = immutable.Map.empty[ActorRef, (String, ByteSize)] + + private var shuttingDown = false + + private val creationMessages = TrieMap[ActorRef, ContainerCreationMessage]() + + private var preWarmScheduler: Option[Cancellable] = None + private var prewarmConfigQueue = Queue.empty[(CodeExec[_], ByteSize, Option[FiniteDuration])] + private var prewarmCreateFailedCount = immutable.Map.empty[(String, ByteSize), Int] + + val logScheduler = context.system.scheduler.schedule(0.seconds, 
1.seconds) { + MetricEmitter.emitHistogramMetric( + LoggingMarkers.INVOKER_CONTAINERPOOL_MEMORY("inprogress"), + memoryConsumptionOf(inProgressPool)) + MetricEmitter.emitHistogramMetric( + LoggingMarkers.INVOKER_CONTAINERPOOL_MEMORY("busy"), + memoryConsumptionOf(busyPool)) + MetricEmitter.emitHistogramMetric( + LoggingMarkers.INVOKER_CONTAINERPOOL_MEMORY("prewarmed"), + memoryConsumptionOf(prewarmedPool)) + MetricEmitter.emitHistogramMetric(LoggingMarkers.INVOKER_CONTAINERPOOL_MEMORY("max"), poolConfig.userMemory.toMB) + } + + // Key is ColdStartKey, value is the number of cold Start in minute + var coldStartCount = immutable.Map.empty[ColdStartKey, Int] + + adjustPrewarmedContainer(true, false) + + // check periodically, adjust prewarmed container(delete if unused for some time and create some increment containers) + // add some random amount to this schedule to avoid a herd of container removal + creation + val interval = poolConfig.prewarmExpirationCheckInterval + poolConfig.prewarmExpirationCheckIntervalVariance + .map(v => + Random + .nextInt(v.toSeconds.toInt)) + .getOrElse(0) + .seconds + + if (prewarmConfig.exists(!_.reactive.isEmpty)) { + context.system.scheduler.schedule( + poolConfig.prewarmExpirationCheckInitDelay, + interval, + self, + AdjustPrewarmedContainer) + } + + val resourceSubmitter = context.system.scheduler.schedule(0.seconds, poolConfig.memorySyncInterval) { + syncMemoryInfo + } + + private def logContainerStart(c: ContainerCreationMessage, action: WhiskAction, containerState: String): Unit = { + val FQN = c.action + if (FQN.namespace.name == "whisk.system" && FQN.fullPath.segments > 2) { + MetricEmitter.emitCounterMetric(LoggingMarkers.INVOKER_SHAREDPACKAGE(FQN.fullPath.asString)) + } + + MetricEmitter.emitCounterMetric( + LoggingMarkers.INVOKER_CONTAINER_START( + containerState, + c.invocationNamespace, + c.action.namespace.toString, + c.action.name.toString)) + } + + def receive: Receive = { + case PrewarmContainer(maxConcurrent) => + if 
(prewarmConfigQueue.isEmpty) { + preWarmScheduler.map(_.cancel()) + preWarmScheduler = None + } else { + for (_ <- 1 to maxConcurrent if !prewarmConfigQueue.isEmpty) { + val ((codeExec, byteSize, ttl), newQueue) = prewarmConfigQueue.dequeue + prewarmConfigQueue = newQueue + prewarmContainer(codeExec, byteSize, ttl) + } + } + + case Creation(create: ContainerCreationMessage, action: WhiskAction) => + if (shuttingDown) { + val message = + s"creationId: ${create.creationId}, invoker is shutting down, reschedule ${action.fullyQualifiedName(false)}" + val ack = ContainerCreationAckMessage( + create.transid, + create.creationId, + create.invocationNamespace, + create.action, + create.revision, + create.whiskActionMetaData, + instance, + create.schedulerHost, + create.rpcPort, + create.retryCount, + Some(ShuttingDownError), + Some(message)) + logging.warn(this, message) + sendAckToScheduler(create.rootSchedulerIndex, ack) + } else { + logging.info(this, s"received a container creation message: ${create.creationId}") Review comment: We leave info logs when an activation flows through the system. Similarly, we can track the container creation flow with this kind of log. I think we can keep this as info. 
########## File path: core/invoker/src/main/resources/application.conf ########## @@ -60,9 +60,13 @@ whisk { user-memory: 1024 m concurrent-peek-factor: 0.5 #factor used to limit message peeking: 0 < factor <= 1.0 - larger number improves concurrent processing, but increases risk of message loss during invoker crash akka-client: false # if true, use PoolingContainerClient for HTTP from invoker to action container (otherwise use ApacheBlockingContainerClient) - prewarm-expiration-check-interval: 1 minute # period to check for prewarm expiration + prewarm-expiration-check-init-delay: 10 minute # the init delay time for the first check + prewarm-expiration-check-interval: 10 minute # period to check for prewarm expiration prewarm-expiration-check-interval-variance: 10 seconds # varies expiration across invokers to avoid many concurrent expirations prewarm-expiration-limit: 100 # number of prewarms to expire in one expiration cycle (remaining expired will be considered for expiration in next cycle) + prewarm-max-retry-limit: 5 # max retry limit for create prewarm Review comment: Basically, this max limit is reached when 5 subsequent retries fail. How about changing this to `max subsequent retry limit to create prewarm containers`? It would be worth mentioning that the count is reinitialized whenever a creation succeeds. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org