Re: [PR] Refactor UT of input_file_name with scan is fallback more robust and cover all supported exprs [incubator-gluten]

via GitHub Wed, 10 Jul 2024 02:40:33 -0700


PHILO-HE commented on code in PR #6318:
URL: https://github.com/apache/incubator-gluten/pull/6318#discussion_r1671950903



##########
gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala:
##########
@@ -18,38 +18,31 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.execution.ProjectExec
 import org.apache.spark.sql.functions.{expr, input_file_name}
-import org.apache.spark.sql.types._
 
 class GlutenColumnExpressionSuite extends ColumnExpressionSuite with 
GlutenSQLTestsTrait {
-  testGluten("input_file_name with scan is fallback") {
-    withTempPath {
-      dir =>
-        val rawData = Seq(
-          Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))),
-          Row(2, "Bob", Seq(Row(Seq(4, 5)))),
-          Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9))))
-        )
-        val schema = StructType(
-          Array(
-            StructField("id", IntegerType, nullable = false),
-            StructField("name", StringType, nullable = false),
-            StructField(
-              "nested_column",
-              ArrayType(
-                StructType(Array(
-                  StructField("array_in_struct", ArrayType(IntegerType), 
nullable = true)
-                ))),
-              nullable = true)
-          ))
-        val data: DataFrame = 
spark.createDataFrame(sparkContext.parallelize(rawData), schema)
-        data.write.parquet(dir.getCanonicalPath)
+  import testImplicits._
+  testGluten(
+    "input_file_name, input_file_block_start, input_file_block_length with 
scan is fallback") {
+    withSQLConf(("spark.gluten.sql.columnar.filescan", "false")) {
+      withTempPath {
+        dir =>
+          val data = sparkContext.parallelize(0 to 10).toDF("id")
+          data.write.parquet(dir.getCanonicalPath)
 
-        val q =
-          spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), 
expr("nested_column"))
-        val firstRow = q.head()
-        assert(firstRow.getString(0).contains(dir.toURI.getPath))
-        val project = q.queryExecution.executedPlan.collect { case p: 
ProjectExec => p }
-        assert(project.size == 1)
+          val q =
+            spark.read
+              .parquet(dir.getCanonicalPath)
+              .select(
+                input_file_name(),
+                expr("input_file_block_start()"),
+                expr("input_file_block_length()"))
+          val firstRow = q.head()
+          assert(firstRow.getString(0).contains(dir.toURI.getPath))
+          assert(firstRow.getLong(1) == 0)
+          assert(firstRow.getLong(2) > 0)
+          val project = q.queryExecution.executedPlan.collect { case p: 
ProjectExec => p }

Review Comment:
   So we expect the project not to be offloaded? Then, do we need the above 
checks on the result row?
   Maybe make the test name more informative, e.g., "input_file_name, 
input_file_block_start and input_file_block_length should fall back if scan 
falls back".



##########
gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala:
##########
@@ -18,38 +18,31 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.execution.ProjectExec
 import org.apache.spark.sql.functions.{expr, input_file_name}
-import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, 
StructField, StructType}
 
 class GlutenColumnExpressionSuite extends ColumnExpressionSuite with 
GlutenSQLTestsTrait {
-  testGluten("input_file_name with scan is fallback") {
-    withTempPath {
-      dir =>
-        val rawData = Seq(
-          Row(1, "Alice", Seq(Row(Seq(1, 2, 3)))),
-          Row(2, "Bob", Seq(Row(Seq(4, 5)))),
-          Row(3, "Charlie", Seq(Row(Seq(6, 7, 8, 9))))
-        )
-        val schema = StructType(
-          Array(
-            StructField("id", IntegerType, nullable = false),
-            StructField("name", StringType, nullable = false),
-            StructField(
-              "nested_column",
-              ArrayType(
-                StructType(Array(
-                  StructField("array_in_struct", ArrayType(IntegerType), 
nullable = true)
-                ))),
-              nullable = true)
-          ))
-        val data: DataFrame = 
spark.createDataFrame(sparkContext.parallelize(rawData), schema)
-        data.write.parquet(dir.getCanonicalPath)
+  import testImplicits._
+  testGluten(
+    "input_file_name, input_file_block_start, input_file_block_length with 
scan is fallback") {
+    withSQLConf(("spark.gluten.sql.columnar.filescan", "false")) {
+      withTempPath {
+        dir =>
+          val data = sparkContext.parallelize(0 to 10).toDF("id")
+          data.write.parquet(dir.getCanonicalPath)
 
-        val q =
-          spark.read.parquet(dir.getCanonicalPath).select(input_file_name(), 
expr("nested_column"))
-        val firstRow = q.head()
-        assert(firstRow.getString(0).contains(dir.toURI.getPath))
-        val project = q.queryExecution.executedPlan.collect { case p: 
ProjectExec => p }
-        assert(project.size == 1)
+          val q =
+            spark.read
+              .parquet(dir.getCanonicalPath)
+              .select(
+                input_file_name(),
+                expr("input_file_block_start()"),

Review Comment:
   Nit: maybe just call `input_file_block_start()` directly instead of using 
`expr` to parse the string. Same for other places.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Refactor UT of input_file_name with scan is fallback more robust and cover all supported exprs [incubator-gluten]

Reply via email to