aokolnychyi commented on a change in pull request #32921:
URL: https://github.com/apache/spark/pull/32921#discussion_r657439358
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala
##########
@@ -17,38 +17,91 @@
package org.apache.spark.sql.execution.datasources.v2
+import com.google.common.base.Objects
+
+import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan}
+import org.apache.spark.sql.catalyst.plans.physical.UnknownPartitioning
+import org.apache.spark.sql.catalyst.util.truncatedString
+import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan, SupportsRuntimeFiltering}
+import org.apache.spark.sql.execution.datasources.DataSourceStrategy
/**
* Physical plan node for scanning a batch of data from a data source v2.
*/
case class BatchScanExec(
output: Seq[AttributeReference],
- @transient scan: Scan) extends DataSourceV2ScanExecBase {
+ @transient scan: Scan,
+ runtimeFilters: Seq[Expression]) extends DataSourceV2ScanExecBase {
@transient lazy val batch = scan.toBatch
// TODO: unify the equal/hashCode implementation for all data source v2 query plans.
override def equals(other: Any): Boolean = other match {
- case other: BatchScanExec => this.batch == other.batch
- case _ => false
+ case other: BatchScanExec =>
+ this.batch == other.batch && this.runtimeFilters == other.runtimeFilters
+ case _ =>
+ false
}
- override def hashCode(): Int = batch.hashCode()
+ override def hashCode(): Int = Objects.hashCode(batch, runtimeFilters)
@transient override lazy val partitions: Seq[InputPartition] = batch.planInputPartitions()
+ @transient private lazy val filteredPartitions: Seq[InputPartition] = {
+ val dataSourceFilters = runtimeFilters.flatMap {
+ case DynamicPruningExpression(e) => DataSourceStrategy.translateRuntimeFilter(e)
+ case _ => None
+ }
+
+ if (dataSourceFilters.nonEmpty) {
+ // the cast is safe as runtime filters are only assigned if the scan can be filtered
+ val filterableScan = scan.asInstanceOf[SupportsRuntimeFiltering]
+ val canChangeNumPartitions = outputPartitioning match {
+ case UnknownPartitioning(0) => true
+ case _ => false
+ }
+ filterableScan.filter(dataSourceFilters.toArray, canChangeNumPartitions)
+
+ // call toBatch again to get new partitions
+ val newPartitions = scan.toBatch.planInputPartitions()
Review comment:
Even though I am calling `toBatch` one more time, I still use the original `readerFactory`.
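
For context, a minimal sketch of why the second `toBatch` call only re-plans partitions, assuming (as in the surrounding class) that `readerFactory` is a lazy val derived from the first `batch`. The class and method names below are hypothetical, not the actual BatchScanExec source:

```scala
import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan}

// Hypothetical node used only to illustrate the point under discussion.
class RuntimeFilteredScanNode(scan: Scan) {
  // Materialized once on first access; later toBatch calls do not replace it.
  @transient lazy val batch: Batch = scan.toBatch

  // Derived from the original batch, so it stays the same even after re-planning.
  @transient lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory()

  // After runtime filters have been pushed into the scan, calling toBatch again
  // yields freshly planned (filtered) partitions, while readerFactory above
  // continues to come from the original batch.
  def replanPartitions(): Seq[InputPartition] = scan.toBatch.planInputPartitions().toSeq
}
```

The implicit assumption is that the connector returns a compatible reader factory for both batches, which is what makes reusing the original one safe.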