cloud-fan commented on a change in pull request #34451:
URL: https://github.com/apache/spark/pull/34451#discussion_r740217364



##########
File path: external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala
##########
@@ -284,4 +288,83 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu
       testIndexUsingSQL(s"$catalogName.new_table")
     }
   }
+
+  def supportsTableSample: Boolean = false
+
+  test("Test TABLESAMPLE") {
+    require(supportsTableSample)
+    withTable(s"$catalogName.new_table") {
+      sql(s"CREATE TABLE $catalogName.new_table (col1 INT, col2 INT)")
+      spark.range(10).select($"id" * 2, $"id" * 2 + 1).write.insertInto(s"$catalogName.new_table")
+
+      val df1 = sql(s"SELECT col1 FROM $catalogName.new_table TABLESAMPLE (BUCKET 6 OUT OF 10)" +
+        s" REPEATABLE (12345)")
+      val scan1 = df1.queryExecution.optimizedPlan.collectFirst {
+        case s: DataSourceV2ScanRelation => s
+      }.get
+      assert(scan1.schema.names.sameElements(Seq("col1")))
+
+      val sample1 = df1.queryExecution.optimizedPlan.collect {
+        case s: Sample => s
+      }
+      assert(sample1.isEmpty)
+      assert(df1.collect().length <= 7)
+
+      val df2 = sql(s"SELECT * FROM $catalogName.new_table TABLESAMPLE (50 PERCENT)" +
+        s" REPEATABLE (12345)")
+      val sample2 = df2.queryExecution.optimizedPlan.collect {
+        case s: Sample => s
+      }
+      assert(sample2.isEmpty)
+      assert(df2.collect().length <= 7)
+
+      val df3 = sql(s"SELECT col1 FROM $catalogName.new_table TABLESAMPLE (BUCKET 6 OUT OF 10)" +
+        s" LIMIT 2")
+      val sample3 = df3.queryExecution.optimizedPlan.collect {
+        case s: Sample => s
+      }
+      assert(sample3.isEmpty)
+      df3.queryExecution.optimizedPlan.collectFirst {
+        case s@DataSourceV2ScanRelation(_, scan, _) => scan match {
+          case v1: V1ScanWrapper =>
+            assert(v1.pushedDownOperators.limit.nonEmpty &&
+              v1.pushedDownOperators.limit.get === 2)

Review comment:
       nit: `assert(v1.pushedDownOperators.limit == Some(2))`
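       For context, a minimal standalone sketch of the nit (with a hypothetical stand-in value for `v1.pushedDownOperators.limit`, which is not reproduced from the patch): comparing the `Option` directly subsumes both the `nonEmpty` check and the value check, and avoids the partial `.get`:

       ```scala
       object LimitAssertDemo extends App {
         // Hypothetical stand-in for v1.pushedDownOperators.limit.
         val limit: Option[Int] = Some(2)

         // Two-step form from the patch: check presence, then unwrap and compare.
         assert(limit.nonEmpty && limit.get == 2)

         // Suggested one-liner: a single Option comparison, no unsafe .get.
         assert(limit == Some(2))
       }
       ```

       In ScalaTest, the `assert` macro will also report the actual `Option` value when the one-liner fails, which makes a failed assertion easier to diagnose.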

##########
File path: external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala
##########
@@ -284,4 +288,83 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu
       testIndexUsingSQL(s"$catalogName.new_table")
     }
   }
+
+  def supportsTableSample: Boolean = false
+
+  test("Test TABLESAMPLE") {
+    require(supportsTableSample)
+    withTable(s"$catalogName.new_table") {
+      sql(s"CREATE TABLE $catalogName.new_table (col1 INT, col2 INT)")
+      spark.range(10).select($"id" * 2, $"id" * 2 + 1).write.insertInto(s"$catalogName.new_table")
+
+      val df1 = sql(s"SELECT col1 FROM $catalogName.new_table TABLESAMPLE (BUCKET 6 OUT OF 10)" +
+        s" REPEATABLE (12345)")
+      val scan1 = df1.queryExecution.optimizedPlan.collectFirst {
+        case s: DataSourceV2ScanRelation => s
+      }.get
+      assert(scan1.schema.names.sameElements(Seq("col1")))
+
+      val sample1 = df1.queryExecution.optimizedPlan.collect {
+        case s: Sample => s
+      }
+      assert(sample1.isEmpty)
+      assert(df1.collect().length <= 7)
+
+      val df2 = sql(s"SELECT * FROM $catalogName.new_table TABLESAMPLE (50 PERCENT)" +
+        s" REPEATABLE (12345)")
+      val sample2 = df2.queryExecution.optimizedPlan.collect {
+        case s: Sample => s
+      }
+      assert(sample2.isEmpty)
+      assert(df2.collect().length <= 7)
+
+      val df3 = sql(s"SELECT col1 FROM $catalogName.new_table TABLESAMPLE (BUCKET 6 OUT OF 10)" +
+        s" LIMIT 2")
+      val sample3 = df3.queryExecution.optimizedPlan.collect {
+        case s: Sample => s
+      }
+      assert(sample3.isEmpty)
+      df3.queryExecution.optimizedPlan.collectFirst {
+        case s@DataSourceV2ScanRelation(_, scan, _) => scan match {

Review comment:
       ```suggestion
           case s @ DataSourceV2ScanRelation(_, scan, _) => scan match {
       ```
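       For context on the nit: `s@Pattern(...)` and `s @ Pattern(...)` are identical to the compiler; the suggestion only adds spaces around the binder for readability. A minimal standalone illustration (hypothetical object and values, not taken from the patch):

       ```scala
       object BinderSpacingDemo extends App {
         val values: Seq[Option[Int]] = Seq(Some(1), None)
         values.foreach {
           // `s @ Some(v)` binds the whole matched value to `s` while also
           // destructuring it; the spaced form is the style being suggested.
           case s @ Some(v) => println(s"bound $s, extracted $v")
           case None        => println("nothing to bind")
         }
       }
       ```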




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
