Re: [PR] Test input_file_name, input_file_block_start & input_file_block_length when scan falls back [incubator-gluten]

via GitHub Wed, 10 Jul 2024 03:43:49 -0700


gaoyangxiaozhu commented on code in PR #6318:
URL: https://github.com/apache/incubator-gluten/pull/6318#discussion_r1672037659



##########
gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala:
##########
@@ -18,38 +18,31 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.execution.ProjectExec
 import org.apache.spark.sql.functions.{expr, input_file_name}
-import org.apache.spark.sql.types._
 
 class GlutenColumnExpressionSuite extends ColumnExpressionSuite with 
GlutenSQLTestsTrait {
-  testGluten("input_file_name with scan is fallback") {
-    withTempPath {
-      dir =>
-        val rawData = Seq(
-          Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))),
-          Row(2, "Bob", Seq(Row(Seq(4, 5)))),
-          Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9))))
-        )
-        val schema = StructType(
-          Array(
-            StructField("id", IntegerType, nullable = false),
-            StructField("name", StringType, nullable = false),
-            StructField(
-              "nested_column",
-              ArrayType(
-                StructType(Array(
-                  StructField("array_in_struct", ArrayType(IntegerType), 
nullable = true)
-                ))),
-              nullable = true)
-          ))
-        val data: DataFrame = 
spark.createDataFrame(sparkContext.parallelize(rawData), schema)
-        data.write.parquet(dir.getCanonicalPath)
+  import testImplicits._
+  testGluten(
+    "input_file_name, input_file_block_start, input_file_block_length with 
scan is fallback") {
+    withSQLConf(("spark.gluten.sql.columnar.filescan", "false")) {
+      withTempPath {
+        dir =>
+          val data = sparkContext.parallelize(0 to 10).toDF("id")
+          data.write.parquet(dir.getCanonicalPath)
 
-        val q =
-          spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), 
expr("nested_column"))
-        val firstRow = q.head()
-        assert(firstRow.getString(0).contains(dir.toURI.getPath))
-        val project = q.queryExecution.executedPlan.collect { case p: 
ProjectExec => p }
-        assert(project.size == 1)
+          val q =
+            spark.read
+              .parquet(dir.getCanonicalPath)
+              .select(
+                input_file_name(),
+                expr("input_file_block_start()"),
+                expr("input_file_block_length()"))
+          val firstRow = q.head()
+          assert(firstRow.getString(0).contains(dir.toURI.getPath))
+          assert(firstRow.getLong(1) == 0)
+          assert(firstRow.getLong(2) > 0)
+          val project = q.queryExecution.executedPlan.collect { case p: 
ProjectExec => p }

Review Comment:
   Yes, if the project has `input_file`-related exprs and the child scan falls 
back, then we also need to fall back the project; that's currently how 
`input_file_*` works. Let me refactor the test name.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Test input_file_name, input_file_block_start & input_file_block_length when scan falls back [incubator-gluten]

Reply via email to