mbutrovich commented on code in PR #2839:
URL: https://github.com/apache/datafusion-comet/pull/2839#discussion_r2589575195


##########
spark/src/test/scala/org/apache/comet/parquet/CometParquetWriterSuite.scala:
##########
@@ -34,122 +34,167 @@ import org.apache.comet.testing.{DataGenOptions, FuzzDataGenerator, SchemaGenOpt
 
 class CometParquetWriterSuite extends CometTestBase {
 
-  test("basic parquet write") {
-    // no support for fully native scan as input yet
-    assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_DATAFUSION)
+  private def createTestData(inputDir: File): String = {
+    val inputPath = new File(inputDir, "input.parquet").getAbsolutePath
+    val schema = FuzzDataGenerator.generateSchema(
+      SchemaGenOptions(generateArray = false, generateStruct = false, generateMap = false))
+    val df = FuzzDataGenerator.generateDataFrame(
+      new Random(42),
+      spark,
+      schema,
+      1000,
+      DataGenOptions(generateNegativeZero = false))
+    withSQLConf(
+      CometConf.COMET_NATIVE_PARQUET_WRITE_ENABLED.key -> "false",
+      SQLConf.SESSION_LOCAL_TIMEZONE.key -> "America/Denver") {
+      df.write.parquet(inputPath)
+    }
+    inputPath
+  }
+
+  private def writeWithCometNativeWriteExec(
+      inputPath: String,
+      outputPath: String): Option[QueryExecution] = {
+    val df = spark.read.parquet(inputPath)
+
+    // Use a listener to capture the execution plan during write
+    var capturedPlan: Option[QueryExecution] = None
+
+    val listener = new org.apache.spark.sql.util.QueryExecutionListener {
+      override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {
+        // Capture plans from write operations
+        if (funcName == "save" || funcName.contains("command")) {
+          capturedPlan = Some(qe)
+        }
+      }
+
+      override def onFailure(
+          funcName: String,
+          qe: QueryExecution,
+          exception: Exception): Unit = {}
+    }
+
+    spark.listenerManager.register(listener)
+
+    try {
+      // Perform native write
+      df.write.parquet(outputPath)
+
+      // Wait for listener to be called with timeout
+      val maxWaitTimeMs = 15000
+      val checkIntervalMs = 100
+      val maxIterations = maxWaitTimeMs / checkIntervalMs
+      var iterations = 0
+
+      while (capturedPlan.isEmpty && iterations < maxIterations) {
+        Thread.sleep(checkIntervalMs)
+        iterations += 1
+      }
+
+      // Verify that CometNativeWriteExec was used
+      assert(
+        capturedPlan.isDefined,
+        s"Listener was not called within ${maxWaitTimeMs}ms - no execution plan captured")
+
+      capturedPlan.foreach { qe =>
+        val executedPlan = qe.executedPlan
+        val hasNativeWrite = executedPlan.exists {
+          case _: CometNativeWriteExec => true
+          case d: DataWritingCommandExec =>
+            d.child.exists {
+              case _: CometNativeWriteExec => true
+              case _ => false
+            }
+          case _ => false
+        }
+
+        assert(
+          hasNativeWrite,
+          s"Expected CometNativeWriteExec in the plan, but got:\n${executedPlan.treeString}")
+      }
+    } finally {
+      spark.listenerManager.unregister(listener)
+    }
+    capturedPlan
+  }
+
+  private def verifyWrittenFile(outputPath: String): Unit = {
+    // Verify the data was written correctly
+    val resultDf = spark.read.parquet(outputPath)
+    assert(resultDf.count() == 1000, "Expected 1000 rows to be written")
+
+    // Verify multiple part files were created
+    val outputDir = new File(outputPath)
+    val partFiles = outputDir.listFiles().filter(_.getName.startsWith("part-"))
+    // With 1000 rows and default parallelism, we should get multiple partitions
+    assert(partFiles.length > 1, "Expected multiple part files to be created")

Review Comment:
   I just moved that logic. Since this is a pretty early proof-of-concept 
feature from @andygrove I'm not too inclined to change test behavior in this PR.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to