olaky commented on code in PR #39408:
URL: https://github.com/apache/spark/pull/39408#discussion_r1069282419


##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileMetadataStructSuite.scala:
##########
@@ -677,4 +677,90 @@ class FileMetadataStructSuite extends QueryTest with SharedSparkSession {
     assert(analyzedStruct.fields.forall(!_.nullable), analyzedStruct.fields.mkString(", "))
     assert(executedStruct.fields.forall(!_.nullable), executedStruct.fields.mkString(", "))
   }
+
+  test("SPARK-41896: Filter on row_index and a stored column at the same 
time") {
+    withTempPath { dir =>
+      val storedIdName = "stored_id"
+      val storedIdUpperLimitExclusive = 520
+      val rowIndexLowerLimitInclusive = 10
+
+      spark.range(start = 500, end = 600)
+        .toDF(storedIdName)
+        .write
+        .format("parquet")
+        .save(dir.getAbsolutePath)
+
+      // Select stored_id 510 to 519 via a stored_id and row_index filter.
+      val collectedRows = spark.read.load(dir.getAbsolutePath)
+        .select(storedIdName, METADATA_ROW_INDEX)
+        .where(col(storedIdName).lt(lit(storedIdUpperLimitExclusive)))
+        .where(col(METADATA_ROW_INDEX).geq(lit(rowIndexLowerLimitInclusive)))
+        .collect()
+
+      assert(collectedRows.length === 10)
+      assert(collectedRows.forall(_.getLong(0) < storedIdUpperLimitExclusive))
+      assert(collectedRows.forall(_.getLong(1) >= rowIndexLowerLimitInclusive))
+    }
+  }
+
+  test("SPARK-41896: Filter on constant and generated metadata attributes at 
the same time") {
+    withTempPath { dir =>
+      val idColumnName = "id"
+      val partitionColumnName = "partition"
+      val numFiles = 4
+      val totalNumRows = 40
+
+      spark.range(end = totalNumRows)
+        .toDF(idColumnName)
+        .withColumn(partitionColumnName, col(idColumnName).mod(lit(numFiles)))
+        .write
+        .partitionBy(partitionColumnName)
+        .format("parquet")
+        .save(dir.getAbsolutePath)
+
+      // Get one file path.
+      val randomTableFilePath = spark.read.load(dir.getAbsolutePath)
+        .select(METADATA_FILE_PATH).collect().head.getString(0)
+
+      val halfTheNumberOfRowsPerFile = totalNumRows / (numFiles * 2)
+      val collectedRows = spark.read.load(dir.getAbsolutePath)
+        .where(col(METADATA_FILE_PATH).equalTo(lit(randomTableFilePath)))
+        .where(col(METADATA_ROW_INDEX).lt(lit(halfTheNumberOfRowsPerFile)))
+        .collect()
+
+      // The query will match half the rows in one of the files.
+      assert(collectedRows.length === halfTheNumberOfRowsPerFile)
+    }
+  }
+
+  test("SPARK-41896: Filter by a function that takes the metadata struct as 
argument") {
+    withTempPath { dir =>
+      val idColumnName = "id"
+      val numFiles = 4
+      spark.range(end = numFiles)
+        .toDF(idColumnName)
+        .withColumn("partition", col(idColumnName))
+        .write
+        .format("parquet")
+        .partitionBy("partition")
+        .save(dir.getAbsolutePath)
+
+      // Select path and partition value for a random file.
+      val testFileData = spark.read.load(dir.getAbsolutePath)
+        .select(idColumnName, METADATA_FILE_PATH).collect().head
+      val testFilePartition = testFileData.getLong(0)
+      val testFilePath = testFileData.getString(1)
+
+      // Create and use a filter using the file path.
+      spark.udf.register("isTestFile",
+        (metadata: Row) => { metadata.getAs[String]("file_path") == testFilePath })
+      val udfFilterResult = spark.read.load(dir.getAbsolutePath)
+        .select(idColumnName, METADATA_FILE_PATH)
+        .where("isTestFile(_metadata)")

Review Comment:
   I assumed this works because at some point during evaluation the plan has to contain `GetStructField`, and at the point where I transform the plan this is already the case.
   I am changing the implementation so that we will no longer depend on this.
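
   For context, here is a minimal, hypothetical sketch of why a resolved plan can be expected to contain `GetStructField` nodes: once a reference such as `_metadata.file_path` is resolved against the metadata struct, the field access surfaces as a `GetStructField` expression that a transform can pattern-match. This assumes only public Catalyst tree-traversal APIs and is not the PR's actual implementation; the helper name is made up for illustration.

   ```scala
   import org.apache.spark.sql.catalyst.expressions.GetStructField
   import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

   // Hypothetical helper, not part of the PR: collect every struct-field
   // access in a plan, e.g. the GetStructField created when
   // `_metadata.file_path` is resolved against the metadata struct.
   def structFieldAccesses(plan: LogicalPlan): Seq[GetStructField] =
     plan.expressions.flatMap(_.collect { case g: GetStructField => g })
   ```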



