gaborgsomogyi commented on a change in pull request #22138: [SPARK-25151][SS] 
Apply Apache Commons Pool to KafkaDataConsumer
URL: https://github.com/apache/spark/pull/22138#discussion_r302985230
 
 

 ##########
 File path: external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/InternalKafkaConsumerPoolSuite.scala
 ##########
 @@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.kafka010
+
+import java.{util => ju}
+
+import scala.collection.JavaConverters._
+
+import org.apache.kafka.clients.consumer.ConsumerConfig._
+import org.apache.kafka.common.TopicPartition
+import org.apache.kafka.common.serialization.ByteArrayDeserializer
+
+import org.apache.spark.SparkEnv
+import org.apache.spark.sql.kafka010.KafkaDataConsumer.CacheKey
+import org.apache.spark.sql.test.SharedSQLContext
+
+class InternalKafkaConsumerPoolSuite extends SharedSQLContext {
+  import org.apache.spark.sql.kafka010.InternalKafkaConsumerPool.PoolConfig._
+
+  test("basic multiple borrows and returns for single key") {
+    val pool = InternalKafkaConsumerPool.build
+
+    val topic = "topic"
+    val partitionId = 0
+    val topicPartition = new TopicPartition(topic, partitionId)
+
+    val kafkaParams: ju.Map[String, Object] = getTestKafkaParams
+
+    val key = new CacheKey(topicPartition, kafkaParams)
+
+    val pooledObjects = (0 to 2).map { _ =>
+      val pooledObject = pool.borrowObject(key, kafkaParams)
+      assertPooledObject(pooledObject, topicPartition, kafkaParams)
+      pooledObject
+    }
+
+    assertPoolStateForKey(pool, key, numIdle = 0, numActive = 3, numTotal = 3)
+    assertPoolState(pool, numIdle = 0, numActive = 3, numTotal = 3)
+
+    val pooledObject2 = pool.borrowObject(key, kafkaParams)
+
+    assertPooledObject(pooledObject2, topicPartition, kafkaParams)
+    assertPoolStateForKey(pool, key, numIdle = 0, numActive = 4, numTotal = 4)
+    assertPoolState(pool, numIdle = 0, numActive = 4, numTotal = 4)
+
+    pooledObjects.foreach(pool.returnObject)
+
+    assertPoolStateForKey(pool, key, numIdle = 3, numActive = 1, numTotal = 4)
+    assertPoolState(pool, numIdle = 3, numActive = 1, numTotal = 4)
+
+    pool.returnObject(pooledObject2)
+
+    // we only allow three idle objects per key
+    assertPoolStateForKey(pool, key, numIdle = 3, numActive = 0, numTotal = 3)
+    assertPoolState(pool, numIdle = 3, numActive = 0, numTotal = 3)
+
+    pool.close()
+  }
+
+  test("basic borrow and return for multiple keys") {
+    val pool = InternalKafkaConsumerPool.build
+
+    val kafkaParams = getTestKafkaParams
+    val topicPartitions: List[TopicPartition] = for (
+      topic <- List("topic", "topic2");
+      partitionId <- 0 to 5
+    ) yield new TopicPartition(topic, partitionId)
+
+    val keys: List[CacheKey] = topicPartitions.map { part =>
+      new CacheKey(part, kafkaParams)
+    }
+
+    // while in loop pool doesn't still exceed total pool size
+    val keyToPooledObjectPairs = borrowObjectsPerKey(pool, kafkaParams, keys)
+
+    assertPoolState(pool, numIdle = 0, numActive = 
keyToPooledObjectPairs.length,
+      numTotal = keyToPooledObjectPairs.length)
+
+    returnObjects(pool, keyToPooledObjectPairs)
+
+    assertPoolState(pool, numIdle = keyToPooledObjectPairs.length, numActive = 
0,
+      numTotal = keyToPooledObjectPairs.length)
+
+    pool.close()
+  }
+
+  test("borrow more than soft max capacity from pool which is neither free 
space nor idle object") {
+    val capacity = 16
+
+    val newConf = Seq(
+      CONFIG_NAME_CAPACITY -> capacity.toString,
+      CONFIG_NAME_MIN_EVICTABLE_IDLE_TIME_MILLIS -> (-1).toString,
+      CONFIG_NAME_EVICTOR_THREAD_RUN_INTERVAL_MILLIS -> (-1).toString)
+
+    withSparkConf(newConf: _*) {
+      val pool = InternalKafkaConsumerPool.build
+
+      val kafkaParams = getTestKafkaParams
+      val topicPartitions: List[TopicPartition] = for (
+        partitionId <- (0 until capacity).toList
+      ) yield new TopicPartition("topic", partitionId)
+
+      val keys: List[CacheKey] = topicPartitions.map { part =>
+        new CacheKey(part, kafkaParams)
+      }
+
+      // while in loop pool doesn't still exceed soft max pool size
+      val keyToPooledObjectPairs = borrowObjectsPerKey(pool, kafkaParams, keys)
+
+      val moreTopicPartition = new TopicPartition("topic2", 0)
+      val newCacheKey = new CacheKey(moreTopicPartition, kafkaParams)
+
+      // exceeds soft max pool size, and also no idle object for cleaning up
+      // but pool will borrow a new object
+      pool.borrowObject(newCacheKey, kafkaParams)
+
+      assertPoolState(pool, numIdle = 0, numActive = 
keyToPooledObjectPairs.length + 1,
+        numTotal = keyToPooledObjectPairs.length + 1)
+
+      pool.close()
+    }
+  }
+
+  test("borrow more than soft max capacity from pool frees up idle objects 
automatically") {
+    val capacity = 16
+
+    val newConf = Seq(
+      CONFIG_NAME_CAPACITY -> capacity.toString,
+      CONFIG_NAME_MIN_EVICTABLE_IDLE_TIME_MILLIS -> (-1).toString,
+      CONFIG_NAME_EVICTOR_THREAD_RUN_INTERVAL_MILLIS -> (-1).toString)
+
+    withSparkConf(newConf: _*) {
+      val pool = InternalKafkaConsumerPool.build
+
+      val kafkaParams = getTestKafkaParams
+      val topicPartitions: List[TopicPartition] = for (
+        partitionId <- (0 until capacity).toList
+      ) yield new TopicPartition("topic", partitionId)
+
+      val keys: List[CacheKey] = topicPartitions.map { part =>
+        new CacheKey(part, kafkaParams)
+      }
+
+      // borrow objects which makes pool reaching soft capacity
+      val keyToPooledObjectPairs = borrowObjectsPerKey(pool, kafkaParams, keys)
+
+      // return 20% of objects to ensure there're some idle objects to free up 
later
+      val numToReturn = (keyToPooledObjectPairs.length * 0.2).toInt
+      returnObjects(pool, keyToPooledObjectPairs.take(numToReturn))
+
+      assertPoolState(pool, numIdle = numToReturn,
+        numActive = keyToPooledObjectPairs.length - numToReturn,
+        numTotal = keyToPooledObjectPairs.length)
+
+      // borrow a new object: there should be some idle objects to clean up
+      val moreTopicPartition = new TopicPartition("topic2", 0)
+      val newCacheKey = new CacheKey(moreTopicPartition, kafkaParams)
+
+      val newObject = pool.borrowObject(newCacheKey, kafkaParams)
+      assertPooledObject(newObject, moreTopicPartition, kafkaParams)
+      assertPoolStateForKey(pool, newCacheKey, numIdle = 0, numActive = 1, 
numTotal = 1)
+
+      // at least one of idle object should be freed up
+      assert(pool.getNumIdle < numToReturn)
+      // we can determine number of active objects correctly
+      assert(pool.getNumActive === keyToPooledObjectPairs.length - numToReturn 
+ 1)
+      // total objects should be more than number of active + 1 but can't 
expect exact number
+      assert(pool.getTotal > keyToPooledObjectPairs.length - numToReturn + 1)
+
+      pool.close()
+    }
+  }
+
+  test("evicting idle objects on background") {
+    import org.scalatest.time.SpanSugar._
+
+    val minEvictableIdleTimeMillis = 3 * 1000 // 3 seconds
+    val evictorThreadRunIntervalMillis = 500 // triggering multiple evictions 
by intention
+
+    val newConf = Seq(
+      CONFIG_NAME_MIN_EVICTABLE_IDLE_TIME_MILLIS -> 
minEvictableIdleTimeMillis.toString,
+      CONFIG_NAME_EVICTOR_THREAD_RUN_INTERVAL_MILLIS -> 
evictorThreadRunIntervalMillis.toString)
+
+    withSparkConf(newConf: _*) {
+      val pool = InternalKafkaConsumerPool.build
+
+      val kafkaParams = getTestKafkaParams
+      val topicPartitions: List[TopicPartition] = for (
+        partitionId <- (0 until 10).toList
+      ) yield new TopicPartition("topic", partitionId)
+
+      val keys: List[CacheKey] = topicPartitions.map { part =>
+        new CacheKey(part, kafkaParams)
+      }
+
+      // borrow and return some consumers to ensure some partitions are being 
idle
+      // this test covers the use cases: rebalance / topic removal happens 
while running query
+      val keyToPooledObjectPairs = borrowObjectsPerKey(pool, kafkaParams, keys)
+      val objectsToReturn = 
keyToPooledObjectPairs.filter(_._1.topicPartition.partition() % 2 == 0)
+      returnObjects(pool, objectsToReturn)
+
+      // wait up to twice than minEvictableIdleTimeMillis to ensure evictor 
thread to clear up
+      // idle objects
+      eventually(timeout((minEvictableIdleTimeMillis.toLong * 2).seconds),
+        interval(evictorThreadRunIntervalMillis.milliseconds)) {
+        assertPoolState(pool, numIdle = 0, numActive = 5, numTotal = 5)
+      }
+
+      pool.close()
+    }
+  }
+
+  private def assertPooledObject(
+      pooledObject: InternalKafkaConsumer,
+      expectedTopicPartition: TopicPartition,
+      expectedKafkaParams: ju.Map[String, Object]): Unit = {
+    assert(pooledObject != null)
+    assert(pooledObject.kafkaParams === expectedKafkaParams)
+    assert(pooledObject.topicPartition === expectedTopicPartition)
+  }
+
+  private def assertPoolState(pool: InternalKafkaConsumerPool, numIdle: Int,
+                              numActive: Int, numTotal: Int): Unit = {
 
 Review comment:
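   Nit: indentation. The wrapped parameter list of `assertPoolState` is aligned under the opening parenthesis; it should use the 4-space continuation indent with one parameter per line, as `assertPooledObject` just above does.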

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to