This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c2e3cf1 [SPARK-37666][SQL] Set `GCM` as the default mode in
`aes_encrypt()`/`aes_decrypt()`
c2e3cf1 is described below
commit c2e3cf19f46b06cf26956f18a027c7ef2ee4c2e7
Author: Max Gekk <[email protected]>
AuthorDate: Fri Dec 17 09:50:28 2021 +0800
[SPARK-37666][SQL] Set `GCM` as the default mode in
`aes_encrypt()`/`aes_decrypt()`
### What changes were proposed in this pull request?
In the PR, I propose to use the `GCM` mode ([Galois/Counter
Mode](https://en.wikipedia.org/wiki/Galois/Counter_Mode)) as the default in the
AES functions `aes_encrypt()` and `aes_decrypt()`. Also I modified AES tests
for the `ECB` mode by specifying the mode explicitly, and expression examples
(AesEncrypt was excluded from checks because `GCM` includes random vectors in
its results, and expression examples became unstable).
### Why are the changes needed?
Current mode `ECB` is not semantically secure, see [Why shouldn't I use ECB
encryption?](https://crypto.stackexchange.com/questions/20941/why-shouldnt-i-use-ecb-encryption)
### Does this PR introduce _any_ user-facing change?
No. Since the `aes_encrypt()`/`aes_decrypt()` functions haven't been
released yet, we can change the default mode to more secure one `GCM`.
### How was this patch tested?
By running the affected test suites:
```
$ build/sbt "test:testOnly org.apache.spark.sql.DataFrameFunctionsSuite"
$ build/sbt "test:testOnly org.apache.spark.sql.MiscFunctionsSuite"
$ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
$ build/sbt "sql/test:testOnly
org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
Closes #34925 from MaxGekk/aes-gcm-default.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/expressions/misc.scala | 22 ++++++++++++----------
.../sql-functions/sql-expression-schema.md | 4 ++--
.../apache/spark/sql/DataFrameFunctionsSuite.scala | 10 +++++-----
.../sql/expressions/ExpressionInfoSuite.scala | 4 +++-
4 files changed, 22 insertions(+), 18 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
index 36330ee..941ccb7 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -315,7 +315,7 @@ case class CurrentUser() extends LeafExpression with
Unevaluable {
usage = """
_FUNC_(expr, key[, mode[, padding]]) - Returns an encrypted value of
`expr` using AES in given `mode` with the specified `padding`.
Key lengths of 16, 24 and 32 bits are supported. Supported combinations
of (`mode`, `padding`) are ('ECB', 'PKCS') and ('GCM', 'NONE').
- The default mode is ECB.
+ The default mode is GCM.
""",
arguments = """
Arguments:
@@ -328,8 +328,10 @@ case class CurrentUser() extends LeafExpression with
Unevaluable {
""",
examples = """
Examples:
- > SELECT base64(_FUNC_('Spark', 'abcdefghijklmnop'));
- 4Hv0UKCx6nfUeAoPZo1z+w==
+ > SELECT hex(_FUNC_('Spark', '0000111122223333'));
+ 83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94
+ > SELECT hex(_FUNC_('Spark SQL', '0000111122223333', 'GCM'));
+
6E7CA17BBB468D3084B5744BCA729FB7B2B7BCB8E4472847D02670489D95FA97DBBA7D3210
> SELECT base64(_FUNC_('Spark SQL', '1234567890abcdef', 'ECB', 'PKCS'));
3lmwu+Mw0H3fi5NDvcu9lg==
""",
@@ -359,7 +361,7 @@ case class AesEncrypt(
def this(input: Expression, key: Expression, mode: Expression) =
this(input, key, mode, Literal("DEFAULT"))
def this(input: Expression, key: Expression) =
- this(input, key, Literal("ECB"))
+ this(input, key, Literal("GCM"))
def exprsReplaced: Seq[Expression] = Seq(input, key, mode, padding)
protected def withNewChildInternal(newChild: Expression): AesEncrypt =
@@ -375,9 +377,9 @@ case class AesEncrypt(
*/
@ExpressionDescription(
usage = """
- _FUNC_(expr, key[, mode[, padding]]) - Returns a decrepted value of `expr`
using AES in `mode` with `padding`.
+ _FUNC_(expr, key[, mode[, padding]]) - Returns a decrypted value of `expr`
using AES in `mode` with `padding`.
Key lengths of 16, 24 and 32 bits are supported. Supported combinations
of (`mode`, `padding`) are ('ECB', 'PKCS') and ('GCM', 'NONE').
- The default mode is ECB.
+ The default mode is GCM.
""",
arguments = """
Arguments:
@@ -390,11 +392,11 @@ case class AesEncrypt(
""",
examples = """
Examples:
- > SELECT _FUNC_(unbase64('4Hv0UKCx6nfUeAoPZo1z+w=='),
'abcdefghijklmnop');
+ > SELECT
_FUNC_(unhex('83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94'),
'0000111122223333');
Spark
- > SELECT _FUNC_(unbase64('3lmwu+Mw0H3fi5NDvcu9lg=='),
'1234567890abcdef', 'ECB', 'PKCS');
+ > SELECT
_FUNC_(unhex('6E7CA17BBB468D3084B5744BCA729FB7B2B7BCB8E4472847D02670489D95FA97DBBA7D3210'),
'0000111122223333', 'GCM');
Spark SQL
- > SELECT
_FUNC_(unbase64('2sXi+jZd/ws+qFC1Tnzvvde5lz+8Haryz9HHBiyrVohXUG7LHA=='),
'1234567890abcdef', 'GCM');
+ > SELECT _FUNC_(unbase64('3lmwu+Mw0H3fi5NDvcu9lg=='),
'1234567890abcdef', 'ECB', 'PKCS');
Spark SQL
""",
since = "3.3.0",
@@ -423,7 +425,7 @@ case class AesDecrypt(
def this(input: Expression, key: Expression, mode: Expression) =
this(input, key, mode, Literal("DEFAULT"))
def this(input: Expression, key: Expression) =
- this(input, key, Literal("ECB"))
+ this(input, key, Literal("GCM"))
def exprsReplaced: Seq[Expression] = Seq(input, key)
protected def withNewChildInternal(newChild: Expression): AesDecrypt =
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 46f170a..2c9fc6f 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -11,8 +11,8 @@
| org.apache.spark.sql.catalyst.expressions.Acosh | acosh | SELECT acosh(1) |
struct<ACOSH(1):double> |
| org.apache.spark.sql.catalyst.expressions.Add | + | SELECT 1 + 2 | struct<(1
+ 2):int> |
| org.apache.spark.sql.catalyst.expressions.AddMonths | add_months | SELECT
add_months('2016-08-31', 1) | struct<add_months(2016-08-31, 1):date> |
-| org.apache.spark.sql.catalyst.expressions.AesDecrypt | aes_decrypt | SELECT
aes_decrypt(unbase64('4Hv0UKCx6nfUeAoPZo1z+w=='), 'abcdefghijklmnop') |
struct<aesdecrypt(unbase64(4Hv0UKCx6nfUeAoPZo1z+w==), abcdefghijklmnop):binary>
|
-| org.apache.spark.sql.catalyst.expressions.AesEncrypt | aes_encrypt | SELECT
base64(aes_encrypt('Spark', 'abcdefghijklmnop')) |
struct<base64(aesencrypt(Spark, abcdefghijklmnop, ECB, DEFAULT)):string> |
+| org.apache.spark.sql.catalyst.expressions.AesDecrypt | aes_decrypt | SELECT
aes_decrypt(unhex('83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94'),
'0000111122223333') |
struct<aesdecrypt(unhex(83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94),
0000111122223333):binary> |
+| org.apache.spark.sql.catalyst.expressions.AesEncrypt | aes_encrypt | SELECT
hex(aes_encrypt('Spark', '0000111122223333')) | struct<hex(aesencrypt(Spark,
0000111122223333, GCM, DEFAULT)):string> |
| org.apache.spark.sql.catalyst.expressions.And | and | SELECT true and true |
struct<(true AND true):boolean> |
| org.apache.spark.sql.catalyst.expressions.ArrayAggregate | aggregate |
SELECT aggregate(array(1, 2, 3), 0, (acc, x) -> acc + x) |
struct<aggregate(array(1, 2, 3), 0, lambdafunction((namedlambdavariable() +
namedlambdavariable()), namedlambdavariable(), namedlambdavariable()),
lambdafunction(namedlambdavariable(), namedlambdavariable())):int> |
| org.apache.spark.sql.catalyst.expressions.ArrayContains | array_contains |
SELECT array_contains(array(1, 2, 3), 2) | struct<array_contains(array(1, 2,
3), 2):boolean> |
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 9159aae..ed3a68c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -286,10 +286,10 @@ class DataFrameFunctionsSuite extends QueryTest with
SharedSparkSession {
(key32, encryptedText32, encryptedEmptyText32)).foreach {
case (key, encryptedText, encryptedEmptyText) =>
checkAnswer(
- df1.selectExpr(s"base64(aes_encrypt(value, '$key'))"),
+ df1.selectExpr(s"base64(aes_encrypt(value, '$key', 'ECB'))"),
Seq(Row(encryptedText), Row(encryptedEmptyText)))
checkAnswer(
- df1.selectExpr(s"base64(aes_encrypt(binary(value), '$key'))"),
+ df1.selectExpr(s"base64(aes_encrypt(binary(value), '$key', 'ECB'))"),
Seq(Row(encryptedText), Row(encryptedEmptyText)))
}
@@ -336,10 +336,10 @@ class DataFrameFunctionsSuite extends QueryTest with
SharedSparkSession {
("value32", key32)).foreach {
case (colName, key) =>
checkAnswer(
- df2.selectExpr(s"cast(aes_decrypt(unbase64($colName), '$key') as
string)"),
+ df2.selectExpr(s"cast(aes_decrypt(unbase64($colName), '$key', 'ECB')
as string)"),
Seq(Row("Spark"), Row("")))
checkAnswer(
- df2.selectExpr(s"cast(aes_decrypt(unbase64($colName),
binary('$key')) as string)"),
+ df2.selectExpr(s"cast(aes_decrypt(unbase64($colName),
binary('$key'), 'ECB') as string)"),
Seq(Row("Spark"), Row("")))
}
@@ -380,7 +380,7 @@ class DataFrameFunctionsSuite extends QueryTest with
SharedSparkSession {
("value32", dummyKey32)).foreach {
case (colName, key) =>
val e = intercept[Exception] {
- df2.selectExpr(s"aes_decrypt(unbase64($colName),
binary('$key'))").collect
+ df2.selectExpr(s"aes_decrypt(unbase64($colName), binary('$key'),
'ECB')").collect
}
assert(e.getMessage.contains("BadPaddingException"))
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
index 11c06d3..7f25831 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala
@@ -190,7 +190,9 @@ class ExpressionInfoSuite extends SparkFunSuite with
SharedSparkSession {
"org.apache.spark.sql.catalyst.expressions.SparkVersion",
// Throws an error
"org.apache.spark.sql.catalyst.expressions.RaiseError",
- classOf[CurrentUser].getName)
+ classOf[CurrentUser].getName,
+ // The encrypt expression includes a random initialization vector to its
encrypted result
+ classOf[AesEncrypt].getName)
val parFuncs = new
ParVector(spark.sessionState.functionRegistry.listFunction().toVector)
parFuncs.foreach { funcId =>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]