cloud-fan commented on code in PR #54330:
URL: https://github.com/apache/spark/pull/54330#discussion_r2876772563
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala:
##########
@@ -50,9 +50,22 @@ private case class ReaderState(reader: PartitionReader[_],
iterator: MetricsIter
// TODO: we should have 2 RDDs: an RDD[InternalRow] for row-based scan, an
`RDD[ColumnarBatch]` for
// columnar scan.
+/**
+ * An RDD that reads data from a V2 data source.
+ *
+ * This RDD handles both row-based and columnar reads, tracks custom metrics
from the data source,
+ * and ensures that task completion listeners are added only once per thread
to avoid duplicate
+ * metric updates and resource cleanup.
+ *
+ * @param sc The Spark context
+ * @param inputPartitions The input partitions to read from
+ * @param partitionReaderFactory Factory for creating partition readers
+ * @param columnarReads Whether to use columnar reads
+ * @param customMetrics Custom metrics defined by the data source
+ */
class DataSourceRDD(
sc: SparkContext,
- @transient private val inputPartitions: Seq[Seq[InputPartition]],
+ @transient private val inputPartitions: Seq[Option[InputPartition]],
Review Comment:
hmm, what does a None input partition mean? Will all the entries be None, or can
None and Some entries be mixed within the same sequence?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]