This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kyuubi.git
The following commit(s) were added to refs/heads/master by this push:
new a834ed3ef [KYUUBI #4530] [AUTHZ] Support non-English chars for MASK,
MASK_SHOW_FIRST_4, and MASK_SHOW_LAST_4
a834ed3ef is described below
commit a834ed3efb19c94035b38e7f03a442d3ce9b5423
Author: huangzhir <[email protected]>
AuthorDate: Mon Apr 10 10:26:28 2023 +0800
[KYUUBI #4530] [AUTHZ] Support non-English chars for MASK,
MASK_SHOW_FIRST_4, and MASK_SHOW_LAST_4
### _Why are the changes needed?_
To fix https://github.com/apache/kyuubi/issues/4530.
1. The cause of issue https://github.com/apache/kyuubi/issues/4530 is
that the MASK_SHOW_FIRST_4 and MASK_SHOW_LAST_4 mask types are currently
implemented with chained regexp_replace calls that only replace English
letters and digits, leaving characters from other languages, such as
Chinese, unmasked.
2. To fix this issue, I added one more regexp_replace call that replaces
every character other than English letters and digits with the letter 'U',
so that non-English characters are masked properly as well.
### _How was this patch tested?_
- [ ] Add some test cases that check the changes thoroughly including
negative and positive cases if possible
- [ ] Add screenshots for manual tests if appropriate
- [x] [Run
test](https://kyuubi.readthedocs.io/en/master/develop_tools/testing.html#running-tests)
locally before making a pull request
Closes #4643 from huangzhir/fixbug-datamask.
Closes #4530
abe45b278 [huangzhir] fix nit
f74e582ed [huangzhir] Move the data preparation to setup,some tests were
modified due to changes in the data.
fb3f89e15 [huangzhir] 1. Modified test methods to perform end-to-end
testing. 2. Mask data should not ignore spaces.
bb6406c81 [huangzhir] Rollback unnecessary changes, add tests using SQL
queries, and modify the Scala style checking code.
7754d74fd [huangzhir] Switching the plan.Replace all characters except
English letters and numbers with a single character 'U'.Preserve the " "
character.
a905817a0 [huangzhir] fix
ce23bcd1b [huangzhir] Regression testing is to keep the original tests
unchanged, and only add the "regexp_replace" test method.
a39f185dd [huangzhir] 1. Use a ‘密’ replacer for it Chinese chars 2. Use a
separate ut cases for testing this regexp_replace method.
94b05db89 [huangzhir] [KYUUBI #4530] [AUTHZ] fixbug support
MASK_SHOW_FIRST_4 和 MASK_SHOW_FIRST_4 chinese data mask
0fc1065ca [huangzhir] fixbug support MASK_SHOW_FIRST_4 和 MASK_SHOW_FIRST_4
chinese data mask
Authored-by: huangzhir <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../authz/ranger/SparkRangerAdminPlugin.scala | 3 +-
.../authz/ranger/SparkRangerAdminPluginSuite.scala | 9 +-
.../ranger/datamasking/DataMaskingTestBase.scala | 103 +++++++++++++++++----
3 files changed, 92 insertions(+), 23 deletions(-)
diff --git
a/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
b/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
index 78e59ff89..8332b27f0 100644
---
a/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
+++
b/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
@@ -136,7 +136,8 @@ object SparkRangerAdminPlugin extends
RangerBasePlugin("spark", "sparkSql")
val upper = s"regexp_replace($expr, '[A-Z]', 'X'$pos)"
val lower = s"regexp_replace($upper, '[a-z]', 'x'$pos)"
val digits = s"regexp_replace($lower, '[0-9]', 'n'$pos)"
- digits
+ val other = s"regexp_replace($digits, '[^A-Za-z0-9]', 'U'$pos)"
+ other
}
/**
diff --git
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
index 8711a7287..3338a3314 100644
---
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
+++
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
@@ -50,11 +50,14 @@ class SparkRangerAdminPluginSuite extends AnyFunSuite {
}
assert(getMaskingExpr(buildAccessRequest(bob, "value1")).get ===
"md5(cast(value1 as string))")
assert(getMaskingExpr(buildAccessRequest(bob, "value2")).get ===
- "regexp_replace(regexp_replace(regexp_replace(value2, '[A-Z]', 'X'),
'[a-z]', 'x')," +
- " '[0-9]', 'n')")
+ "regexp_replace(regexp_replace(regexp_replace(regexp_replace(value2,
'[A-Z]', 'X')," +
+ " '[a-z]', 'x'), '[0-9]', 'n'), '[^A-Za-z0-9]', 'U')")
assert(getMaskingExpr(buildAccessRequest(bob, "value3")).get contains
"regexp_replace")
assert(getMaskingExpr(buildAccessRequest(bob, "value4")).get ===
"date_trunc('YEAR', value4)")
- assert(getMaskingExpr(buildAccessRequest(bob, "value5")).get contains
"regexp_replace")
+ assert(getMaskingExpr(buildAccessRequest(bob, "value5")).get ===
+ "concat(regexp_replace(regexp_replace(regexp_replace(regexp_replace(" +
+ "left(value5, length(value5) - 4), '[A-Z]', 'X'), '[a-z]', 'x')," +
+ " '[0-9]', 'n'), '[^A-Za-z0-9]', 'U'), right(value5, 4))")
Seq("admin", "alice").foreach { user =>
val ugi = UserGroupInformation.createRemoteUser(user)
diff --git
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
index 3585397c6..29a709311 100644
---
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
+++
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
@@ -55,6 +55,17 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
"SELECT 20, 2, 'kyuubi', 'y', timestamp'2018-11-17 12:34:56', 'world'")
sql("INSERT INTO default.src " +
"SELECT 30, 3, 'spark', 'a', timestamp'2018-11-17 12:34:56', 'world'")
+
+ // scalastyle:off
+ val value1 = "hello WORD 123 ~!@# AßþΔЙקم๗ቐあア叶葉엽"
+ val value2 = "AßþΔЙקم๗ቐあア叶葉엽 hello WORD 123 ~!@#"
+ // AßþΔЙקم๗ቐあア叶葉엽 reference
https://zh.wikipedia.org/zh-cn/Unicode#XML.E5.92.8CUnicode
+ // scalastyle:on
+ sql(s"INSERT INTO default.src " +
+ s"SELECT 10, 4, '$value1', '$value1', timestamp'2018-11-17 12:34:56',
'$value1'")
+ sql("INSERT INTO default.src " +
+ s"SELECT 11, 5, '$value2', '$value2', timestamp'2018-11-17 12:34:56',
'$value2'")
+
sql(s"CREATE TABLE default.unmasked $format AS SELECT * FROM default.src")
}
@@ -74,23 +85,30 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
}
test("simple query with a user doesn't have mask rules") {
- checkAnswer("kent", "SELECT key FROM default.src order by key",
Seq(Row(1), Row(20), Row(30)))
+ checkAnswer(
+ "kent",
+ "SELECT key FROM default.src order by key",
+ Seq(Row(1), Row(10), Row(11), Row(20), Row(30)))
}
test("simple query with a user has mask rules") {
val result =
Seq(Row(md5Hex("1"), "xxxxx", "worlx", Timestamp.valueOf("2018-01-01
00:00:00"), "Xorld"))
- checkAnswer("bob", "SELECT value1, value2, value3, value4, value5 FROM
default.src", result)
checkAnswer(
"bob",
- "SELECT value1 as key, value2, value3, value4, value5 FROM default.src",
+ "SELECT value1, value2, value3, value4, value5 FROM default.src " +
+ "where key = 1",
+ result)
+ checkAnswer(
+ "bob",
+ "SELECT value1 as key, value2, value3, value4, value5 FROM default.src
where key = 1",
result)
}
test("star") {
val result =
Seq(Row(1, md5Hex("1"), "xxxxx", "worlx", Timestamp.valueOf("2018-01-01
00:00:00"), "Xorld"))
- checkAnswer("bob", "SELECT * FROM default.src", result)
+ checkAnswer("bob", "SELECT * FROM default.src where key = 1", result)
}
test("simple udf") {
@@ -98,7 +116,8 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
Seq(Row(md5Hex("1"), "xxxxx", "worlx", Timestamp.valueOf("2018-01-01
00:00:00"), "Xorld"))
checkAnswer(
"bob",
- "SELECT max(value1), max(value2), max(value3), max(value4), max(value5)
FROM default.src",
+ "SELECT max(value1), max(value2), max(value3), max(value4), max(value5)
FROM default.src" +
+ " where key = 1",
result)
}
@@ -109,7 +128,7 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
"bob",
"SELECT coalesce(max(value1), 1), coalesce(max(value2), 1),
coalesce(max(value3), 1), " +
"coalesce(max(value4), timestamp '2018-01-01 22:33:44'),
coalesce(max(value5), 1) " +
- "FROM default.src",
+ "FROM default.src where key = 1",
result)
}
@@ -119,13 +138,16 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
checkAnswer(
"bob",
"SELECT value1, value2, value3, value4, value5 FROM default.src WHERE
value2 in " +
- "(SELECT value2 as key FROM default.src)",
+ "(SELECT value2 as key FROM default.src where key = 1)",
result)
}
test("create a unmasked table as select from a masked one") {
withCleanTmpResources(Seq(("default.src2", "table"))) {
- doAs("bob", sql(s"CREATE TABLE default.src2 $format AS SELECT value1
FROM default.src"))
+ doAs(
+ "bob",
+ sql(s"CREATE TABLE default.src2 $format AS SELECT value1 FROM
default.src " +
+ s"where key = 1"))
checkAnswer("bob", "SELECT value1 FROM default.src2",
Seq(Row(md5Hex("1"))))
}
}
@@ -133,12 +155,24 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
test("insert into a unmasked table from a masked one") {
withCleanTmpResources(Seq(("default.src2", "table"), ("default.src3",
"table"))) {
doAs("bob", sql(s"CREATE TABLE default.src2 (value1 string) $format"))
- doAs("bob", sql(s"INSERT INTO default.src2 SELECT value1 from
default.src"))
- doAs("bob", sql(s"INSERT INTO default.src2 SELECT value1 as v from
default.src"))
+ doAs(
+ "bob",
+ sql(s"INSERT INTO default.src2 SELECT value1 from default.src " +
+ s"where key = 1"))
+ doAs(
+ "bob",
+ sql(s"INSERT INTO default.src2 SELECT value1 as v from default.src " +
+ s"where key = 1"))
checkAnswer("bob", "SELECT value1 FROM default.src2",
Seq(Row(md5Hex("1")), Row(md5Hex("1"))))
doAs("bob", sql(s"CREATE TABLE default.src3 (k int, value string)
$format"))
- doAs("bob", sql(s"INSERT INTO default.src3 SELECT key, value1 from
default.src"))
- doAs("bob", sql(s"INSERT INTO default.src3 SELECT key, value1 as v from
default.src"))
+ doAs(
+ "bob",
+ sql(s"INSERT INTO default.src3 SELECT key, value1 from default.src " +
+ s"where key = 1"))
+ doAs(
+ "bob",
+ sql(s"INSERT INTO default.src3 SELECT key, value1 as v from
default.src " +
+ s"where key = 1"))
checkAnswer("bob", "SELECT value FROM default.src3",
Seq(Row(md5Hex("1")), Row(md5Hex("1"))))
}
}
@@ -152,7 +186,7 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
test("self join on a masked table") {
val s = "SELECT a.value1, b.value1 FROM default.src a" +
- " join default.src b on a.value1=b.value1"
+ " join default.src b on a.value1=b.value1 where a.key = 1 and b.key = 1 "
checkAnswer("bob", s, Seq(Row(md5Hex("1"), md5Hex("1"))))
// just for testing query multiple times, don't delete it
checkAnswer("bob", s, Seq(Row(md5Hex("1"), md5Hex("1"))))
@@ -228,17 +262,18 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
test("union an unmasked table") {
val s = """
SELECT value1 from (
- SELECT a.value1 FROM default.src a
+ SELECT a.value1 FROM default.src a where a.key = 1
union
(SELECT b.value1 FROM default.unmasked b)
) c order by value1
"""
- checkAnswer("bob", s, Seq(Row("1"), Row("2"), Row("3"), Row(md5Hex("1"))))
+ doAs("bob", sql(s).show)
+ checkAnswer("bob", s, Seq(Row("1"), Row("2"), Row("3"), Row("4"),
Row("5"), Row(md5Hex("1"))))
}
test("union a masked table") {
- val s = "SELECT a.value1 FROM default.src a union" +
- " (SELECT b.value1 FROM default.src b)"
+ val s = "SELECT a.value1 FROM default.src a where a.key = 1 union" +
+ " (SELECT b.value1 FROM default.src b where b.key = 1)"
checkAnswer("bob", s, Seq(Row(md5Hex("1"))))
}
@@ -252,12 +287,42 @@ trait DataMaskingTestBase extends AnyFunSuite with
SparkSessionProvider with Bef
withCleanTmpResources(Seq(("default.perm_view", "view"))) {
checkAnswer(
"perm_view_user",
- "SELECT value1, value2 FROM default.src where key < 20",
+ "SELECT value1, value2 FROM default.src where key = 1",
Seq(Row(1, "hello")))
checkAnswer(
"perm_view_user",
- "SELECT value1, value2 FROM default.perm_view where key < 20",
+ "SELECT value1, value2 FROM default.perm_view where key = 1",
Seq(Row(md5Hex("1"), "hello")))
}
}
+
+ // This test only includes a small subset of UCS-2 characters.
+ // But in theory, it should work for all characters
+ test("test MASK,MASK_SHOW_FIRST_4,MASK_SHOW_LAST_4 rule with non-English
character set") {
+ val s1 = s"SELECT * FROM default.src where key = 10"
+ val s2 = s"SELECT * FROM default.src where key = 11"
+ // scalastyle:off
+ checkAnswer(
+ "bob",
+ s1,
+ Seq(Row(
+ 10,
+ md5Hex("4"),
+ "xxxxxUXXXXUnnnUUUUUUXUUUUUUUUUUUUU",
+ "hellxUXXXXUnnnUUUUUUXUUUUUUUUUUUUU",
+ Timestamp.valueOf("2018-01-01 00:00:00"),
+ "xxxxxUXXXXUnnnUUUUUUXUUUUUUUUUア叶葉엽")))
+ checkAnswer(
+ "bob",
+ s2,
+ Seq(Row(
+ 11,
+ md5Hex("5"),
+ "XUUUUUUUUUUUUUUxxxxxUXXXXUnnnUUUUU",
+ "AßþΔUUUUUUUUUUUxxxxxUXXXXUnnnUUUUU",
+ Timestamp.valueOf("2018-01-01 00:00:00"),
+ "XUUUUUUUUUUUUUUxxxxxUXXXXUnnnU~!@#")))
+ // scalastyle:on
+ }
+
}