Re: [PR] [spark] Add scan.maxRecordsPerPartition config to split log table input partitions [fluss]

via GitHub Wed, 03 Jun 2026 04:49:35 -0700


fresh-borzoni commented on code in PR #3260:
URL: https://github.com/apache/fluss/pull/3260#discussion_r3344934114



##########
fluss-spark/fluss-spark-common/src/main/scala/org/apache/fluss/spark/read/FlussBatch.scala:
##########
@@ -129,26 +130,58 @@ class FlussAppendBatch(
   }
 
   override def planInputPartitions(): Array[InputPartition] = {
-    val bucketOffsetsRetrieverImpl = new BucketOffsetsRetrieverImpl(admin, tablePath)
+    val maxRecordsPerPartition: Option[Long] = {
+      val value = flussConfig.getLong(SparkFlussConf.SCAN_MAX_RECORDS_PER_PARTITION, 0)
+      if (value > 0) Some(value) else None
+    }
+
+    val bucketOffsetsRetrieverImpl = maxRecordsPerPartition match {
+      case Some(_) => new BucketOffsetsRetrieverImpl(admin, tablePath, true)
+      case _ => new BucketOffsetsRetrieverImpl(admin, tablePath)
+    }
     val buckets = (0 until tableInfo.getNumBuckets).toSeq
 
+    def splitOffsetRange(
+        tableBucket: TableBucket,
+        startOffset: Long,
+        stopOffset: Long,
+        maxRecords: Long): Seq[InputPartition] = {
+      if (

Review Comment:
   What about empty tables and empty buckets?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [spark] Add scan.maxRecordsPerPartition config to split log table input partitions [fluss]

Reply via email to