[GitHub] [spark] gaborgsomogyi commented on a change in pull request #29729: [SPARK-32032][SS] Avoid infinite wait in driver because of poll(long) API

GitBox Sat, 12 Sep 2020 13:35:19 -0700


gaborgsomogyi commented on a change in pull request #29729:
URL: https://github.com/apache/spark/pull/29729#discussion_r487063789




##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala
##########
@@ -105,34 +102,16 @@ private[kafka010] class KafkaOffsetReader(
     minPartitions.map(_ > numTopicPartitions).getOrElse(false)
   }
 
-  private def nextGroupId(): String = {
-    groupId = driverGroupIdPrefix + "-" + nextId
-    nextId += 1
-    groupId
-  }
-
   override def toString(): String = consumerStrategy.toString
 
   /**
    * Closes the connection to Kafka, and cleans up state.
    */
   def close(): Unit = {
-    if (_consumer != null) uninterruptibleThreadRunner.runUninterruptibly { 
stopConsumer() }
+    if (_admin != null) uninterruptibleThreadRunner.runUninterruptibly { 
stopAdmin() }

Review comment:
       I've left `UninterruptibleThreadRunner` for now but I think it can be 
removed since there is no evidence that `AdminClient` suffers from the same 
issue.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala
##########
@@ -213,64 +180,70 @@ private[kafka010] class KafkaOffsetReader(
       assert(partitions.asScala == partitionTimestamps.keySet,
         "If starting/endingOffsetsByTimestamp contains specific offsets, you 
must specify all " +
           s"topics. Specified: ${partitionTimestamps.keySet} Assigned: 
${partitions.asScala}")
-      logDebug(s"Partitions assigned to consumer: $partitions. Seeking to 
$partitionTimestamps")
+      logDebug(s"Assigned partitions: $partitions. Seeking to 
$partitionTimestamps")
     }
 
     val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => 
Map[TopicPartition, Long] = { _ => {
-        val converted = partitionTimestamps.map { case (tp, timestamp) =>
-          tp -> java.lang.Long.valueOf(timestamp)
+        val listOffsetsParams = partitionTimestamps.map { p =>
+          p._1 -> OffsetSpec.forTimestamp(p._2)
         }.asJava
+        admin.listOffsets(listOffsetsParams, 
listOffsetsOptions()).all().get().asScala.map {
+          case (tp, offsetSpec) =>
+            if (failsOnNoMatchingOffset) {
+              assert(offsetSpec.offset() != 
OffsetFetchResponse.INVALID_OFFSET, "No offset " +
+                s"matched from request of topic-partition $tp and timestamp " +
+                s"${partitionTimestamps(tp)}.")
+            }
 
-        val offsetForTime: ju.Map[TopicPartition, OffsetAndTimestamp] =
-          consumer.offsetsForTimes(converted)
-
-        offsetForTime.asScala.map { case (tp, offsetAndTimestamp) =>
-          if (failsOnNoMatchingOffset) {
-            assert(offsetAndTimestamp != null, "No offset matched from request 
of " +
-              s"topic-partition $tp and timestamp ${partitionTimestamps(tp)}.")
-          }
-
-          if (offsetAndTimestamp == null) {
-            tp -> KafkaOffsetRangeLimit.LATEST
-          } else {
-            tp -> offsetAndTimestamp.offset()
-          }
+            if (offsetSpec == null) {

Review comment:
       Different API, different result. `offsetSpec` can be null.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
##########
@@ -96,8 +96,7 @@ private[kafka010] class KafkaSourceProvider extends 
DataSourceRegister
     val kafkaOffsetReader = new KafkaOffsetReader(
       strategy(caseInsensitiveParameters),
       kafkaParamsForDriver(specifiedKafkaParams),
-      caseInsensitiveParameters,
-      driverGroupIdPrefix = s"$uniqueGroupId-driver")

Review comment:
       GroupId removal in this file since `AdminClient` doesn't require it.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -689,57 +691,6 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     )
   }
 
-  test("allow group.id prefix") {

Review comment:
       No need for the tests because the driver is not using `KafkaConsumer` => no 
GroupId needed.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala
##########
@@ -465,41 +463,6 @@ abstract class KafkaRelationSuiteBase extends QueryTest 
with SharedSparkSession
     testBadOptions("subscribePattern" -> "")("pattern to subscribe is empty")
   }
 
-  test("allow group.id prefix") {

Review comment:
       Same here.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -608,7 +608,9 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     // in executors.
     val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new 
ForeachWriter[Int] {
       override def open(partitionId: Long, version: Long): Boolean = {
+        // Re-create topic since Kafka auto topic creation is not supported by 
Spark
         KafkaSourceSuite.globalTestUtils.deleteTopic(topic)
+        KafkaSourceSuite.globalTestUtils.createTopic(topic)

Review comment:
       This is needed because we don't use `KafkaConsumer` and `AdminClient` 
doesn't provide auto topic creation. Please see the doc for the rationale.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -689,57 +691,6 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     )
   }
 
-  test("allow group.id prefix") {

Review comment:
       No need for the tests because the driver is not using `KafkaConsumer` => no 
GroupId needed.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala
##########
@@ -105,34 +102,16 @@ private[kafka010] class KafkaOffsetReader(
     minPartitions.map(_ > numTopicPartitions).getOrElse(false)
   }
 
-  private def nextGroupId(): String = {
-    groupId = driverGroupIdPrefix + "-" + nextId
-    nextId += 1
-    groupId
-  }
-
   override def toString(): String = consumerStrategy.toString
 
   /**
    * Closes the connection to Kafka, and cleans up state.
    */
   def close(): Unit = {
-    if (_consumer != null) uninterruptibleThreadRunner.runUninterruptibly { 
stopConsumer() }
+    if (_admin != null) uninterruptibleThreadRunner.runUninterruptibly { 
stopAdmin() }

Review comment:
       I've left `UninterruptibleThreadRunner` for now but I think it can be 
removed since there is no evidence that `AdminClient` suffers from the same 
issue.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala
##########
@@ -213,64 +180,70 @@ private[kafka010] class KafkaOffsetReader(
       assert(partitions.asScala == partitionTimestamps.keySet,
         "If starting/endingOffsetsByTimestamp contains specific offsets, you 
must specify all " +
           s"topics. Specified: ${partitionTimestamps.keySet} Assigned: 
${partitions.asScala}")
-      logDebug(s"Partitions assigned to consumer: $partitions. Seeking to 
$partitionTimestamps")
+      logDebug(s"Assigned partitions: $partitions. Seeking to 
$partitionTimestamps")
     }
 
     val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => 
Map[TopicPartition, Long] = { _ => {
-        val converted = partitionTimestamps.map { case (tp, timestamp) =>
-          tp -> java.lang.Long.valueOf(timestamp)
+        val listOffsetsParams = partitionTimestamps.map { p =>
+          p._1 -> OffsetSpec.forTimestamp(p._2)
         }.asJava
+        admin.listOffsets(listOffsetsParams, 
listOffsetsOptions()).all().get().asScala.map {
+          case (tp, offsetSpec) =>
+            if (failsOnNoMatchingOffset) {
+              assert(offsetSpec.offset() != 
OffsetFetchResponse.INVALID_OFFSET, "No offset " +
+                s"matched from request of topic-partition $tp and timestamp " +
+                s"${partitionTimestamps(tp)}.")
+            }
 
-        val offsetForTime: ju.Map[TopicPartition, OffsetAndTimestamp] =
-          consumer.offsetsForTimes(converted)
-
-        offsetForTime.asScala.map { case (tp, offsetAndTimestamp) =>
-          if (failsOnNoMatchingOffset) {
-            assert(offsetAndTimestamp != null, "No offset matched from request 
of " +
-              s"topic-partition $tp and timestamp ${partitionTimestamps(tp)}.")
-          }
-
-          if (offsetAndTimestamp == null) {
-            tp -> KafkaOffsetRangeLimit.LATEST
-          } else {
-            tp -> offsetAndTimestamp.offset()
-          }
+            if (offsetSpec == null) {

Review comment:
       Different API, different result. `offsetSpec` can be null.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
##########
@@ -96,8 +96,7 @@ private[kafka010] class KafkaSourceProvider extends 
DataSourceRegister
     val kafkaOffsetReader = new KafkaOffsetReader(
       strategy(caseInsensitiveParameters),
       kafkaParamsForDriver(specifiedKafkaParams),
-      caseInsensitiveParameters,
-      driverGroupIdPrefix = s"$uniqueGroupId-driver")

Review comment:
       GroupId removal in this file since `AdminClient` doesn't require it.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -689,57 +691,6 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     )
   }
 
-  test("allow group.id prefix") {

Review comment:
       No need for the tests because the driver is not using `KafkaConsumer` => no 
GroupId needed.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala
##########
@@ -465,41 +463,6 @@ abstract class KafkaRelationSuiteBase extends QueryTest 
with SharedSparkSession
     testBadOptions("subscribePattern" -> "")("pattern to subscribe is empty")
   }
 
-  test("allow group.id prefix") {

Review comment:
       Same here.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -608,7 +608,9 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     // in executors.
     val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new 
ForeachWriter[Int] {
       override def open(partitionId: Long, version: Long): Boolean = {
+        // Re-create topic since Kafka auto topic creation is not supported by 
Spark
         KafkaSourceSuite.globalTestUtils.deleteTopic(topic)
+        KafkaSourceSuite.globalTestUtils.createTopic(topic)

Review comment:
       This is needed because we don't use `KafkaConsumer` and `AdminClient` 
doesn't provide auto topic creation. Please see the doc for the rationale.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -689,57 +691,6 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     )
   }
 
-  test("allow group.id prefix") {

Review comment:
       No need for the tests because the driver is not using `KafkaConsumer` => no 
GroupId needed.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala
##########
@@ -105,34 +102,16 @@ private[kafka010] class KafkaOffsetReader(
     minPartitions.map(_ > numTopicPartitions).getOrElse(false)
   }
 
-  private def nextGroupId(): String = {
-    groupId = driverGroupIdPrefix + "-" + nextId
-    nextId += 1
-    groupId
-  }
-
   override def toString(): String = consumerStrategy.toString
 
   /**
    * Closes the connection to Kafka, and cleans up state.
    */
   def close(): Unit = {
-    if (_consumer != null) uninterruptibleThreadRunner.runUninterruptibly { 
stopConsumer() }
+    if (_admin != null) uninterruptibleThreadRunner.runUninterruptibly { 
stopAdmin() }

Review comment:
       I've left `UninterruptibleThreadRunner` for now but I think it can be 
removed since there is no evidence that `AdminClient` suffers from the same 
issue.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala
##########
@@ -213,64 +180,70 @@ private[kafka010] class KafkaOffsetReader(
       assert(partitions.asScala == partitionTimestamps.keySet,
         "If starting/endingOffsetsByTimestamp contains specific offsets, you 
must specify all " +
           s"topics. Specified: ${partitionTimestamps.keySet} Assigned: 
${partitions.asScala}")
-      logDebug(s"Partitions assigned to consumer: $partitions. Seeking to 
$partitionTimestamps")
+      logDebug(s"Assigned partitions: $partitions. Seeking to 
$partitionTimestamps")
     }
 
     val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => 
Map[TopicPartition, Long] = { _ => {
-        val converted = partitionTimestamps.map { case (tp, timestamp) =>
-          tp -> java.lang.Long.valueOf(timestamp)
+        val listOffsetsParams = partitionTimestamps.map { p =>
+          p._1 -> OffsetSpec.forTimestamp(p._2)
         }.asJava
+        admin.listOffsets(listOffsetsParams, 
listOffsetsOptions()).all().get().asScala.map {
+          case (tp, offsetSpec) =>
+            if (failsOnNoMatchingOffset) {
+              assert(offsetSpec.offset() != 
OffsetFetchResponse.INVALID_OFFSET, "No offset " +
+                s"matched from request of topic-partition $tp and timestamp " +
+                s"${partitionTimestamps(tp)}.")
+            }
 
-        val offsetForTime: ju.Map[TopicPartition, OffsetAndTimestamp] =
-          consumer.offsetsForTimes(converted)
-
-        offsetForTime.asScala.map { case (tp, offsetAndTimestamp) =>
-          if (failsOnNoMatchingOffset) {
-            assert(offsetAndTimestamp != null, "No offset matched from request 
of " +
-              s"topic-partition $tp and timestamp ${partitionTimestamps(tp)}.")
-          }
-
-          if (offsetAndTimestamp == null) {
-            tp -> KafkaOffsetRangeLimit.LATEST
-          } else {
-            tp -> offsetAndTimestamp.offset()
-          }
+            if (offsetSpec == null) {

Review comment:
       Different API, different result. `offsetSpec` can be null.

##########
File path: 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
##########
@@ -96,8 +96,7 @@ private[kafka010] class KafkaSourceProvider extends 
DataSourceRegister
     val kafkaOffsetReader = new KafkaOffsetReader(
       strategy(caseInsensitiveParameters),
       kafkaParamsForDriver(specifiedKafkaParams),
-      caseInsensitiveParameters,
-      driverGroupIdPrefix = s"$uniqueGroupId-driver")

Review comment:
       GroupId removal in this file since `AdminClient` doesn't require it.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -689,57 +691,6 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     )
   }
 
-  test("allow group.id prefix") {

Review comment:
       No need for the tests because the driver is not using `KafkaConsumer` => no 
GroupId needed.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala
##########
@@ -465,41 +463,6 @@ abstract class KafkaRelationSuiteBase extends QueryTest 
with SharedSparkSession
     testBadOptions("subscribePattern" -> "")("pattern to subscribe is empty")
   }
 
-  test("allow group.id prefix") {

Review comment:
       Same here.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -608,7 +608,9 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     // in executors.
     val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new 
ForeachWriter[Int] {
       override def open(partitionId: Long, version: Long): Boolean = {
+        // Re-create topic since Kafka auto topic creation is not supported by 
Spark
         KafkaSourceSuite.globalTestUtils.deleteTopic(topic)
+        KafkaSourceSuite.globalTestUtils.createTopic(topic)

Review comment:
       This is needed because we don't use `KafkaConsumer` and `AdminClient` 
doesn't provide auto topic creation. Please see the doc for the rationale.

##########
File path: 
external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala
##########
@@ -689,57 +691,6 @@ abstract class KafkaMicroBatchSourceSuiteBase extends 
KafkaSourceSuiteBase {
     )
   }
 
-  test("allow group.id prefix") {

Review comment:
       No need for the tests because the driver is not using `KafkaConsumer` => no 
GroupId needed.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] gaborgsomogyi commented on a change in pull request #29729: [SPARK-32032][SS] Avoid infinite wait in driver because of poll(long) API

Reply via email to