This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new c2e3cf1  [SPARK-37666][SQL] Set `GCM` as the default mode in 
`aes_encrypt()`/`aes_decrypt()`
c2e3cf1 is described below

commit c2e3cf19f46b06cf26956f18a027c7ef2ee4c2e7
Author: Max Gekk <[email protected]>
AuthorDate: Fri Dec 17 09:50:28 2021 +0800

    [SPARK-37666][SQL] Set `GCM` as the default mode in 
`aes_encrypt()`/`aes_decrypt()`
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to use the `GCM` mode ([Galois/Counter 
Mode](https://en.wikipedia.org/wiki/Galois/Counter_Mode)) as the default in the 
AES functions `aes_encrypt()` and `aes_decrypt()`. Also I modified AES tests 
for the `ECB` mode by specifying the mode explicitly, and expression examples 
(AesEncrypt was excluded from checks because `GCM` includes random vectors in 
its results, and expression examples became unstable).
    
    ### Why are the changes needed?
    Current mode `ECB` is not semantically secure, see [Why shouldn't I use ECB 
encryption?](https://crypto.stackexchange.com/questions/20941/why-shouldnt-i-use-ecb-encryption)
    
    ### Does this PR introduce _any_ user-facing change?
    No. Since the `aes_encrypt()`/`aes_decrypt()` functions haven't been 
released yet, we can change the default mode to more secure one `GCM`.
    
    ### How was this patch tested?
    By running the affected test suites:
    ```
    $ build/sbt "test:testOnly org.apache.spark.sql.DataFrameFunctionsSuite"
    $ build/sbt "test:testOnly org.apache.spark.sql.MiscFunctionsSuite"
    $ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
    $ build/sbt "sql/test:testOnly 
org.apache.spark.sql.expressions.ExpressionInfoSuite"
    ```
    
    Closes #34925 from MaxGekk/aes-gcm-default.
    
    Authored-by: Max Gekk <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/expressions/misc.scala      | 22 ++++++++++++----------
 .../sql-functions/sql-expression-schema.md         |  4 ++--
 .../apache/spark/sql/DataFrameFunctionsSuite.scala | 10 +++++-----
 .../sql/expressions/ExpressionInfoSuite.scala      |  4 +++-
 4 files changed, 22 insertions(+), 18 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
index 36330ee..941ccb7 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -315,7 +315,7 @@ case class CurrentUser() extends LeafExpression with 
Unevaluable {
   usage = """
     _FUNC_(expr, key[, mode[, padding]]) - Returns an encrypted value of 
`expr` using AES in given `mode` with the specified `padding`.
       Key lengths of 16, 24 and 32 bits are supported. Supported combinations 
of (`mode`, `padding`) are ('ECB', 'PKCS') and ('GCM', 'NONE').
-      The default mode is ECB.
+      The default mode is GCM.
   """,
   arguments = """
     Arguments:
@@ -328,8 +328,10 @@ case class CurrentUser() extends LeafExpression with 
Unevaluable {
   """,
   examples = """
     Examples:
-      > SELECT base64(_FUNC_('Spark', 'abcdefghijklmnop'));
-       4Hv0UKCx6nfUeAoPZo1z+w==
+      > SELECT hex(_FUNC_('Spark', '0000111122223333'));
+       83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94
+      > SELECT hex(_FUNC_('Spark SQL', '0000111122223333', 'GCM'));
+       
6E7CA17BBB468D3084B5744BCA729FB7B2B7BCB8E4472847D02670489D95FA97DBBA7D3210
       > SELECT base64(_FUNC_('Spark SQL', '1234567890abcdef', 'ECB', 'PKCS'));
        3lmwu+Mw0H3fi5NDvcu9lg==
   """,
@@ -359,7 +361,7 @@ case class AesEncrypt(
   def this(input: Expression, key: Expression, mode: Expression) =
     this(input, key, mode, Literal("DEFAULT"))
   def this(input: Expression, key: Expression) =
-    this(input, key, Literal("ECB"))
+    this(input, key, Literal("GCM"))
 
   def exprsReplaced: Seq[Expression] = Seq(input, key, mode, padding)
   protected def withNewChildInternal(newChild: Expression): AesEncrypt =
@@ -375,9 +377,9 @@ case class AesEncrypt(
  */
 @ExpressionDescription(
   usage = """
-    _FUNC_(expr, key[, mode[, padding]]) - Returns a decrepted value of `expr` 
using AES in `mode` with `padding`.
+    _FUNC_(expr, key[, mode[, padding]]) - Returns a decrypted value of `expr` 
using AES in `mode` with `padding`.
       Key lengths of 16, 24 and 32 bits are supported. Supported combinations 
of (`mode`, `padding`) are ('ECB', 'PKCS') and ('GCM', 'NONE').
-      The default mode is ECB.
+      The default mode is GCM.
   """,
   arguments = """
     Arguments:
@@ -390,11 +392,11 @@ case class AesEncrypt(
   """,
   examples = """
     Examples:
-      > SELECT _FUNC_(unbase64('4Hv0UKCx6nfUeAoPZo1z+w=='), 
'abcdefghijklmnop');
+      > SELECT 
_FUNC_(unhex('83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94'),
 '0000111122223333');
        Spark
-      > SELECT _FUNC_(unbase64('3lmwu+Mw0H3fi5NDvcu9lg=='), 
'1234567890abcdef', 'ECB', 'PKCS');
+      > SELECT 
_FUNC_(unhex('6E7CA17BBB468D3084B5744BCA729FB7B2B7BCB8E4472847D02670489D95FA97DBBA7D3210'),
 '0000111122223333', 'GCM');
        Spark SQL
-      > SELECT 
_FUNC_(unbase64('2sXi+jZd/ws+qFC1Tnzvvde5lz+8Haryz9HHBiyrVohXUG7LHA=='), 
'1234567890abcdef', 'GCM');
+      > SELECT _FUNC_(unbase64('3lmwu+Mw0H3fi5NDvcu9lg=='), 
'1234567890abcdef', 'ECB', 'PKCS');
        Spark SQL
   """,
   since = "3.3.0",
@@ -423,7 +425,7 @@ case class AesDecrypt(
   def this(input: Expression, key: Expression, mode: Expression) =
     this(input, key, mode, Literal("DEFAULT"))
   def this(input: Expression, key: Expression) =
-    this(input, key, Literal("ECB"))
+    this(input, key, Literal("GCM"))
 
   def exprsReplaced: Seq[Expression] = Seq(input, key)
   protected def withNewChildInternal(newChild: Expression): AesDecrypt =
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md 
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 46f170a..2c9fc6f 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -11,8 +11,8 @@
 | org.apache.spark.sql.catalyst.expressions.Acosh | acosh | SELECT acosh(1) | 
struct<ACOSH(1):double> |
 | org.apache.spark.sql.catalyst.expressions.Add | + | SELECT 1 + 2 | struct<(1 
+ 2):int> |
 | org.apache.spark.sql.catalyst.expressions.AddMonths | add_months | SELECT 
add_months('2016-08-31', 1) | struct<add_months(2016-08-31, 1):date> |
-| org.apache.spark.sql.catalyst.expressions.AesDecrypt | aes_decrypt | SELECT 
aes_decrypt(unbase64('4Hv0UKCx6nfUeAoPZo1z+w=='), 'abcdefghijklmnop') | 
struct<aesdecrypt(unbase64(4Hv0UKCx6nfUeAoPZo1z+w==), abcdefghijklmnop):binary> 
|
-| org.apache.spark.sql.catalyst.expressions.AesEncrypt | aes_encrypt | SELECT 
base64(aes_encrypt('Spark', 'abcdefghijklmnop')) | 
struct<base64(aesencrypt(Spark, abcdefghijklmnop, ECB, DEFAULT)):string> |
+| org.apache.spark.sql.catalyst.expressions.AesDecrypt | aes_decrypt | SELECT 
aes_decrypt(unhex('83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94'),
 '0000111122223333') | 
struct<aesdecrypt(unhex(83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94),
 0000111122223333):binary> |
+| org.apache.spark.sql.catalyst.expressions.AesEncrypt | aes_encrypt | SELECT 
hex(aes_encrypt('Spark', '0000111122223333')) | struct<hex(aesencrypt(Spark, 
0000111122223333, GCM, DEFAULT)):string> |
 | org.apache.spark.sql.catalyst.expressions.And | and | SELECT true and true | 
struct<(true AND true):boolean> |
 | org.apache.spark.sql.catalyst.expressions.ArrayAggregate | aggregate | 
SELECT aggregate(array(1, 2, 3), 0, (acc, x) -> acc + x) | 
struct<aggregate(array(1, 2, 3), 0, lambdafunction((namedlambdavariable() + 
namedlambdavariable()), namedlambdavariable(), namedlambdavariable()), 
lambdafunction(namedlambdavariable(), namedlambdavariable())):int> |
 | org.apache.spark.sql.catalyst.expressions.ArrayContains | array_contains | 
SELECT array_contains(array(1, 2, 3), 2) | struct<array_contains(array(1, 2, 
3), 2):boolean> |
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 9159aae..ed3a68c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -286,10 +286,10 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession {
       (key32, encryptedText32, encryptedEmptyText32)).foreach {
       case (key, encryptedText, encryptedEmptyText) =>
         checkAnswer(
-          df1.selectExpr(s"base64(aes_encrypt(value, '$key'))"),
+          df1.selectExpr(s"base64(aes_encrypt(value, '$key', 'ECB'))"),
           Seq(Row(encryptedText), Row(encryptedEmptyText)))
         checkAnswer(
-          df1.selectExpr(s"base64(aes_encrypt(binary(value), '$key'))"),
+          df1.selectExpr(s"base64(aes_encrypt(binary(value), '$key', 'ECB'))"),
           Seq(Row(encryptedText), Row(encryptedEmptyText)))
     }
 
@@ -336,10 +336,10 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession {
       ("value32", key32)).foreach {
       case (colName, key) =>
         checkAnswer(
-          df2.selectExpr(s"cast(aes_decrypt(unbase64($colName), '$key') as 
string)"),
+          df2.selectExpr(s"cast(aes_decrypt(unbase64($colName), '$key', 'ECB') 
as string)"),
           Seq(Row("Spark"), Row("")))
         checkAnswer(
-          df2.selectExpr(s"cast(aes_decrypt(unbase64($colName), 
binary('$key')) as string)"),
+          df2.selectExpr(s"cast(aes_decrypt(unbase64($colName), 
binary('$key'), 'ECB') as string)"),
           Seq(Row("Spark"), Row("")))
     }
 
@@ -380,7 +380,7 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession {
       ("value32", dummyKey32)).foreach {
       case (colName, key) =>
         val e = intercept[Exception] {
-          df2.selectExpr(s"aes_decrypt(unbase64($colName), 
binary('$key'))").collect
+          df2.selectExpr(s"aes_decrypt(unbase64($colName), binary('$key'), 
'ECB')").collect
         }
         assert(e.getMessage.contains("BadPaddingException"))
     }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
index 11c06d3..7f25831 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
@@ -190,7 +190,9 @@ class ExpressionInfoSuite extends SparkFunSuite with 
SharedSparkSession {
       "org.apache.spark.sql.catalyst.expressions.SparkVersion",
       // Throws an error
       "org.apache.spark.sql.catalyst.expressions.RaiseError",
-      classOf[CurrentUser].getName)
+      classOf[CurrentUser].getName,
+      // The encrypt expression includes a random initialization vector to its 
encrypted result
+      classOf[AesEncrypt].getName)
 
     val parFuncs = new 
ParVector(spark.sessionState.functionRegistry.listFunction().toVector)
     parFuncs.foreach { funcId =>

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to