This is an automated email from the ASF dual-hosted git repository.
huaxingao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b0548c6 [SPARK-37165][SQL] Add REPEATABLE in TABLESAMPLE to specify seed
b0548c6 is described below
commit b0548c66672b54dfa82914c09caa581e0ff33947
Author: Huaxin Gao <[email protected]>
AuthorDate: Sat Oct 30 09:27:11 2021 -0700
[SPARK-37165][SQL] Add REPEATABLE in TABLESAMPLE to specify seed
### What changes were proposed in this pull request?
Add REPEATABLE to the SQL TABLESAMPLE syntax so users can specify a seed.
### Why are the changes needed?
Current syntax for TABLESAMPLE:
- TABLESAMPLE(x PERCENT)
- TABLESAMPLE(BUCKET x OUT OF y)
`Dataset.sample` has a parameter to specify the seed, so SQL should offer a way to specify a seed as well:
```
def sample(fraction: Double, seed: Long): Dataset[T] = {
  sample(withReplacement = false, fraction = fraction, seed = seed)
}
```
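For comparison, here is a minimal, self-contained sketch of seed-based reproducibility on the DataFrame side; the column name `c`, the local SparkSession setup, and the seed value are illustrative and not part of this patch:
```
import org.apache.spark.sql.SparkSession

// Sketch only: a local SparkSession for demonstration.
val spark = SparkSession.builder().master("local[*]").appName("sample-seed").getOrCreate()
val df = spark.range(0, 100).toDF("c")

// Passing the same seed twice yields the same sampled rows.
val s1 = df.sample(0.2, 12345L).collect()
val s2 = df.sample(0.2, 12345L).collect()
assert(s1.sameElements(s2))
```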
Most DBMSs use REPEATABLE to let users specify a seed (e.g. DB2), so we follow the same convention.
<img width="1032" alt="Screen Shot 2021-10-29 at 8 46 04 AM" src="https://user-images.githubusercontent.com/13592258/139465718-285ab5fb-a9cf-4bef-bc32-88301745b12b.png">
### Does this PR introduce _any_ user-facing change?
Yes, this adds new SQL syntax (a usage sketch follows this list):
- TABLESAMPLE(x PERCENT) [REPEATABLE (seed)]
- TABLESAMPLE(BUCKET x OUT OF y) [REPEATABLE (seed)]
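A usage sketch of the new syntax through `spark.sql`; the table name `t` and the seed values are illustrative only:
```
// Sketch only: assumes a table `t` already exists in the current catalog.
val a = spark.sql("SELECT * FROM t TABLESAMPLE (20 PERCENT) REPEATABLE (12345)").collect()
val b = spark.sql("SELECT * FROM t TABLESAMPLE (20 PERCENT) REPEATABLE (12345)").collect()
// With REPEATABLE and the same seed, both queries return the same sample.

// The bucket form accepts a seed the same way.
val c = spark.sql("SELECT * FROM t TABLESAMPLE (BUCKET 4 OUT OF 10) REPEATABLE (6789)")
```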
### How was this patch tested?
A new unit test in `SQLQuerySuite`.
Closes #34442 from huaxingao/sample_syntax.
Authored-by: Huaxin Gao <[email protected]>
Signed-off-by: Huaxin Gao <[email protected]>
---
docs/sql-ref-ansi-compliance.md | 1 +
.../org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 5 ++++-
.../spark/sql/catalyst/parser/AstBuilder.scala | 21 ++++++++++++++-------
.../scala/org/apache/spark/sql/SQLQuerySuite.scala | 16 ++++++++++++++++
4 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index 4527faa..2de8ba7 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -493,6 +493,7 @@ Below is a list of all the keywords in Spark SQL.
|REGEXP|non-reserved|non-reserved|not a keyword|
|RENAME|non-reserved|non-reserved|non-reserved|
|REPAIR|non-reserved|non-reserved|non-reserved|
+|REPEATABLE|non-reserved|non-reserved|non-reserved|
|REPLACE|non-reserved|non-reserved|non-reserved|
|RESET|non-reserved|non-reserved|non-reserved|
|RESPECT|non-reserved|non-reserved|non-reserved|
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 32b080c..e1bccf6 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -674,7 +674,7 @@ joinCriteria
;
sample
- : TABLESAMPLE '(' sampleMethod? ')'
+ : TABLESAMPLE '(' sampleMethod? ')' (REPEATABLE '('seed=INTEGER_VALUE')')?
;
sampleMethod
@@ -1194,6 +1194,7 @@ ansiNonReserved
| REFRESH
| RENAME
| REPAIR
+ | REPEATABLE
| REPLACE
| RESET
| RESPECT
@@ -1460,6 +1461,7 @@ nonReserved
| REFRESH
| RENAME
| REPAIR
+ | REPEATABLE
| REPLACE
| RESET
| RESPECT
@@ -1726,6 +1728,7 @@ REFERENCES: 'REFERENCES';
REFRESH: 'REFRESH';
RENAME: 'RENAME';
REPAIR: 'REPAIR';
+REPEATABLE: 'REPEATABLE';
REPLACE: 'REPLACE';
RESET: 'RESET';
RESPECT: 'RESPECT';
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 768d406..722a055 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -1174,13 +1174,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
*
* This currently supports the following sampling methods:
* - TABLESAMPLE(x ROWS): Sample the table down to the given number of rows.
- * - TABLESAMPLE(x PERCENT): Sample the table down to the given percentage. Note that percentages
- *   are defined as a number between 0 and 100.
- * - TABLESAMPLE(BUCKET x OUT OF y): Sample the table down to a 'x' divided by 'y' fraction.
+ * - TABLESAMPLE(x PERCENT) [REPEATABLE (y)]: Sample the table down to the given percentage with
+ *   seed 'y'. Note that percentages are defined as a number between 0 and 100.
+ * - TABLESAMPLE(BUCKET x OUT OF y) [REPEATABLE (z)]: Sample the table down to a 'x' divided by
+ *   'y' fraction with seed 'z'.
*/
private def withSample(ctx: SampleContext, query: LogicalPlan): LogicalPlan = withOrigin(ctx) {
// Create a sampled plan if we need one.
- def sample(fraction: Double): Sample = {
+ def sample(fraction: Double, seed: Long): Sample = {
// The range of fraction accepted by Sample is [0, 1]. Because Hive's block sampling
// function takes X PERCENT as the input and the range of X is [0, 100], we need to
// adjust the fraction.
@@ -1188,13 +1189,19 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
validate(fraction >= 0.0 - eps && fraction <= 1.0 + eps,
s"Sampling fraction ($fraction) must be on interval [0, 1]",
ctx)
- Sample(0.0, fraction, withReplacement = false, (math.random * 1000).toInt, query)
+ Sample(0.0, fraction, withReplacement = false, seed, query)
}
if (ctx.sampleMethod() == null) {
throw QueryParsingErrors.emptyInputForTableSampleError(ctx)
}
+ val seed = if (ctx.seed != null) {
+ ctx.seed.getText.toLong
+ } else {
+ (math.random * 1000).toLong
+ }
+
ctx.sampleMethod() match {
case ctx: SampleByRowsContext =>
Limit(expression(ctx.expression), query)
@@ -1202,7 +1209,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
case ctx: SampleByPercentileContext =>
val fraction = ctx.percentage.getText.toDouble
val sign = if (ctx.negativeSign == null) 1 else -1
- sample(sign * fraction / 100.0d)
+ sample(sign * fraction / 100.0d, seed)
case ctx: SampleByBytesContext =>
val bytesStr = ctx.bytes.getText
@@ -1222,7 +1229,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg
}
case ctx: SampleByBucketContext =>
- sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble)
+ sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble, seed)
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 11b7ee6..0b8e10c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -4211,6 +4211,22 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
sql("SELECT * FROM testData, LATERAL (SELECT * FROM testData)").collect()
}
}
+
+ test("TABLE SAMPLE") {
+ withTable("test") {
+ sql("CREATE TABLE test(c int) USING PARQUET")
+ for (i <- 0 to 20) {
+ sql(s"INSERT INTO test VALUES ($i)")
+ }
+ val df1 = sql("SELECT * FROM test TABLESAMPLE (20 PERCENT) REPEATABLE (12345)")
+ val df2 = sql("SELECT * FROM test TABLESAMPLE (20 PERCENT) REPEATABLE (12345)")
+ checkAnswer(df1, df2)
+
+ val df3 = sql("SELECT * FROM test TABLESAMPLE (BUCKET 4 OUT OF 10) REPEATABLE (6789)")
+ val df4 = sql("SELECT * FROM test TABLESAMPLE (BUCKET 4 OUT OF 10) REPEATABLE (6789)")
+ checkAnswer(df3, df4)
+ }
+ }
}
case class Foo(bar: Option[String])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]