This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kyuubi.git


The following commit(s) were added to refs/heads/master by this push:
     new a834ed3ef [KYUUBI #4530]  [AUTHZ] Support non-English chars for MASK, 
MASK_SHOW_FIRST_4, and MASK_SHOW_LAST_4
a834ed3ef is described below

commit a834ed3efb19c94035b38e7f03a442d3ce9b5423
Author: huangzhir <[email protected]>
AuthorDate: Mon Apr 10 10:26:28 2023 +0800

    [KYUUBI #4530]  [AUTHZ] Support non-English chars for MASK, 
MASK_SHOW_FIRST_4, and MASK_SHOW_LAST_4
    
    ### _Why are the changes needed?_
    To fix https://github.com/apache/kyuubi/issues/4530.
    1. Issue https://github.com/apache/kyuubi/issues/4530 occurs because the 
MASK_SHOW_FIRST_4 and MASK_SHOW_LAST_4 mask types are implemented with 
regexp_replace calls that only replace English letters and digits, so 
characters from other languages, such as Chinese, are left unmasked.
    2. To fix this, I added a further regexp_replace step that replaces every 
character other than English letters and digits (`[^A-Za-z0-9]`) with the 
letter 'U', so those characters are masked as well.
    
    ### _How was this patch tested?_
    
    - [ ] Add some test cases that check the changes thoroughly including 
negative and positive cases if possible
    
    - [ ] Add screenshots for manual tests if appropriate
    
    - [x] [Run 
test](https://kyuubi.readthedocs.io/en/master/develop_tools/testing.html#running-tests)
 locally before making a pull request
    
    Closes #4643 from huangzhir/fixbug-datamask.
    
    Closes #4530
    
    abe45b278 [huangzhir] fix nit
    f74e582ed [huangzhir] Move the data preparation to setup,some tests were 
modified due to changes in the data.
    fb3f89e15 [huangzhir] 1. Modified test methods to perform end-to-end 
testing. 2. Mask data should not ignore spaces.
    bb6406c81 [huangzhir] Rollback unnecessary changes, add tests using SQL 
queries, and modify the Scala style checking code.
    7754d74fd [huangzhir] Switching the plan.Replace all characters except 
English letters and numbers with a single character 'U'.Preserve the " " 
character.
    a905817a0 [huangzhir] fix
    ce23bcd1b [huangzhir] Regression testing is to keep the original tests 
unchanged, and only add the "regexp_replace" test method.
    a39f185dd [huangzhir] 1. Use a ‘密’ replacer for it Chinese chars 2. Use a 
separate ut cases for testing this regexp_replace method.
    94b05db89 [huangzhir] [KYUUBI #4530] [AUTHZ] fixbug support 
MASK_SHOW_FIRST_4 和 MASK_SHOW_FIRST_4 chinese data mask
    0fc1065ca [huangzhir] fixbug support MASK_SHOW_FIRST_4 和 MASK_SHOW_FIRST_4 
chinese data mask
    
    Authored-by: huangzhir <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../authz/ranger/SparkRangerAdminPlugin.scala      |   3 +-
 .../authz/ranger/SparkRangerAdminPluginSuite.scala |   9 +-
 .../ranger/datamasking/DataMaskingTestBase.scala   | 103 +++++++++++++++++----
 3 files changed, 92 insertions(+), 23 deletions(-)

diff --git 
a/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
 
b/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
index 78e59ff89..8332b27f0 100644
--- 
a/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
+++ 
b/extensions/spark/kyuubi-spark-authz/src/main/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPlugin.scala
@@ -136,7 +136,8 @@ object SparkRangerAdminPlugin extends 
RangerBasePlugin("spark", "sparkSql")
     val upper = s"regexp_replace($expr, '[A-Z]', 'X'$pos)"
     val lower = s"regexp_replace($upper, '[a-z]', 'x'$pos)"
     val digits = s"regexp_replace($lower, '[0-9]', 'n'$pos)"
-    digits
+    val other = s"regexp_replace($digits, '[^A-Za-z0-9]', 'U'$pos)"
+    other
   }
 
   /**
diff --git 
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
 
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
index 8711a7287..3338a3314 100644
--- 
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
+++ 
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/SparkRangerAdminPluginSuite.scala
@@ -50,11 +50,14 @@ class SparkRangerAdminPluginSuite extends AnyFunSuite {
     }
     assert(getMaskingExpr(buildAccessRequest(bob, "value1")).get === 
"md5(cast(value1 as string))")
     assert(getMaskingExpr(buildAccessRequest(bob, "value2")).get ===
-      "regexp_replace(regexp_replace(regexp_replace(value2, '[A-Z]', 'X'), 
'[a-z]', 'x')," +
-      " '[0-9]', 'n')")
+      "regexp_replace(regexp_replace(regexp_replace(regexp_replace(value2, 
'[A-Z]', 'X')," +
+      " '[a-z]', 'x'), '[0-9]', 'n'), '[^A-Za-z0-9]', 'U')")
     assert(getMaskingExpr(buildAccessRequest(bob, "value3")).get contains 
"regexp_replace")
     assert(getMaskingExpr(buildAccessRequest(bob, "value4")).get === 
"date_trunc('YEAR', value4)")
-    assert(getMaskingExpr(buildAccessRequest(bob, "value5")).get contains 
"regexp_replace")
+    assert(getMaskingExpr(buildAccessRequest(bob, "value5")).get ===
+      "concat(regexp_replace(regexp_replace(regexp_replace(regexp_replace(" +
+      "left(value5, length(value5) - 4), '[A-Z]', 'X'), '[a-z]', 'x')," +
+      " '[0-9]', 'n'), '[^A-Za-z0-9]', 'U'), right(value5, 4))")
 
     Seq("admin", "alice").foreach { user =>
       val ugi = UserGroupInformation.createRemoteUser(user)
diff --git 
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
 
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
index 3585397c6..29a709311 100644
--- 
a/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
+++ 
b/extensions/spark/kyuubi-spark-authz/src/test/scala/org/apache/kyuubi/plugin/spark/authz/ranger/datamasking/DataMaskingTestBase.scala
@@ -55,6 +55,17 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
       "SELECT 20, 2, 'kyuubi', 'y', timestamp'2018-11-17 12:34:56', 'world'")
     sql("INSERT INTO default.src " +
       "SELECT 30, 3, 'spark', 'a', timestamp'2018-11-17 12:34:56', 'world'")
+
+    // scalastyle:off
+    val value1 = "hello WORD 123 ~!@# AßþΔЙקم๗ቐあア叶葉엽"
+    val value2 = "AßþΔЙקم๗ቐあア叶葉엽 hello WORD 123 ~!@#"
+    // AßþΔЙקم๗ቐあア叶葉엽 reference 
https://zh.wikipedia.org/zh-cn/Unicode#XML.E5.92.8CUnicode
+    // scalastyle:on
+    sql(s"INSERT INTO default.src " +
+      s"SELECT 10, 4, '$value1', '$value1', timestamp'2018-11-17 12:34:56', 
'$value1'")
+    sql("INSERT INTO default.src " +
+      s"SELECT 11, 5, '$value2', '$value2', timestamp'2018-11-17 12:34:56', 
'$value2'")
+
     sql(s"CREATE TABLE default.unmasked $format AS SELECT * FROM default.src")
   }
 
@@ -74,23 +85,30 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
   }
 
   test("simple query with a user doesn't have mask rules") {
-    checkAnswer("kent", "SELECT key FROM default.src order by key", 
Seq(Row(1), Row(20), Row(30)))
+    checkAnswer(
+      "kent",
+      "SELECT key FROM default.src order by key",
+      Seq(Row(1), Row(10), Row(11), Row(20), Row(30)))
   }
 
   test("simple query with a user has mask rules") {
     val result =
       Seq(Row(md5Hex("1"), "xxxxx", "worlx", Timestamp.valueOf("2018-01-01 
00:00:00"), "Xorld"))
-    checkAnswer("bob", "SELECT value1, value2, value3, value4, value5 FROM 
default.src", result)
     checkAnswer(
       "bob",
-      "SELECT value1 as key, value2, value3, value4, value5 FROM default.src",
+      "SELECT value1, value2, value3, value4, value5 FROM default.src " +
+        "where key = 1",
+      result)
+    checkAnswer(
+      "bob",
+      "SELECT value1 as key, value2, value3, value4, value5 FROM default.src 
where key = 1",
       result)
   }
 
   test("star") {
     val result =
       Seq(Row(1, md5Hex("1"), "xxxxx", "worlx", Timestamp.valueOf("2018-01-01 
00:00:00"), "Xorld"))
-    checkAnswer("bob", "SELECT * FROM default.src", result)
+    checkAnswer("bob", "SELECT * FROM default.src where key = 1", result)
   }
 
   test("simple udf") {
@@ -98,7 +116,8 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
       Seq(Row(md5Hex("1"), "xxxxx", "worlx", Timestamp.valueOf("2018-01-01 
00:00:00"), "Xorld"))
     checkAnswer(
       "bob",
-      "SELECT max(value1), max(value2), max(value3), max(value4), max(value5) 
FROM default.src",
+      "SELECT max(value1), max(value2), max(value3), max(value4), max(value5) 
FROM default.src" +
+        " where key = 1",
       result)
   }
 
@@ -109,7 +128,7 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
       "bob",
       "SELECT coalesce(max(value1), 1), coalesce(max(value2), 1), 
coalesce(max(value3), 1), " +
         "coalesce(max(value4), timestamp '2018-01-01 22:33:44'), 
coalesce(max(value5), 1) " +
-        "FROM default.src",
+        "FROM default.src where key = 1",
       result)
   }
 
@@ -119,13 +138,16 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
     checkAnswer(
       "bob",
       "SELECT value1, value2, value3, value4, value5 FROM default.src WHERE 
value2 in " +
-        "(SELECT value2 as key FROM default.src)",
+        "(SELECT value2 as key FROM default.src where key = 1)",
       result)
   }
 
   test("create a unmasked table as select from a masked one") {
     withCleanTmpResources(Seq(("default.src2", "table"))) {
-      doAs("bob", sql(s"CREATE TABLE default.src2 $format AS SELECT value1 
FROM default.src"))
+      doAs(
+        "bob",
+        sql(s"CREATE TABLE default.src2 $format AS SELECT value1 FROM 
default.src " +
+          s"where key = 1"))
       checkAnswer("bob", "SELECT value1 FROM default.src2", 
Seq(Row(md5Hex("1"))))
     }
   }
@@ -133,12 +155,24 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
   test("insert into a unmasked table from a masked one") {
     withCleanTmpResources(Seq(("default.src2", "table"), ("default.src3", 
"table"))) {
       doAs("bob", sql(s"CREATE TABLE default.src2 (value1 string) $format"))
-      doAs("bob", sql(s"INSERT INTO default.src2 SELECT value1 from 
default.src"))
-      doAs("bob", sql(s"INSERT INTO default.src2 SELECT value1 as v from 
default.src"))
+      doAs(
+        "bob",
+        sql(s"INSERT INTO default.src2 SELECT value1 from default.src " +
+          s"where key = 1"))
+      doAs(
+        "bob",
+        sql(s"INSERT INTO default.src2 SELECT value1 as v from default.src " +
+          s"where key = 1"))
       checkAnswer("bob", "SELECT value1 FROM default.src2", 
Seq(Row(md5Hex("1")), Row(md5Hex("1"))))
       doAs("bob", sql(s"CREATE TABLE default.src3 (k int, value string) 
$format"))
-      doAs("bob", sql(s"INSERT INTO default.src3 SELECT key, value1 from 
default.src"))
-      doAs("bob", sql(s"INSERT INTO default.src3 SELECT key, value1 as v from 
default.src"))
+      doAs(
+        "bob",
+        sql(s"INSERT INTO default.src3 SELECT key, value1 from default.src  " +
+          s"where key = 1"))
+      doAs(
+        "bob",
+        sql(s"INSERT INTO default.src3 SELECT key, value1 as v from 
default.src " +
+          s"where key = 1"))
       checkAnswer("bob", "SELECT value FROM default.src3", 
Seq(Row(md5Hex("1")), Row(md5Hex("1"))))
     }
   }
@@ -152,7 +186,7 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
 
   test("self join on a masked table") {
     val s = "SELECT a.value1, b.value1 FROM default.src a" +
-      " join default.src b on a.value1=b.value1"
+      " join default.src b on a.value1=b.value1 where a.key = 1 and b.key = 1 "
     checkAnswer("bob", s, Seq(Row(md5Hex("1"), md5Hex("1"))))
     // just for testing query multiple times, don't delete it
     checkAnswer("bob", s, Seq(Row(md5Hex("1"), md5Hex("1"))))
@@ -228,17 +262,18 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
   test("union an unmasked table") {
     val s = """
       SELECT value1 from (
-           SELECT a.value1 FROM default.src a
+           SELECT a.value1 FROM default.src a where a.key = 1
            union
           (SELECT b.value1 FROM default.unmasked b)
       ) c order by value1
       """
-    checkAnswer("bob", s, Seq(Row("1"), Row("2"), Row("3"), Row(md5Hex("1"))))
+    doAs("bob", sql(s).show)
+    checkAnswer("bob", s, Seq(Row("1"), Row("2"), Row("3"), Row("4"), 
Row("5"), Row(md5Hex("1"))))
   }
 
   test("union a masked table") {
-    val s = "SELECT a.value1 FROM default.src a union" +
-      " (SELECT b.value1 FROM default.src b)"
+    val s = "SELECT a.value1 FROM default.src a where a.key = 1 union" +
+      " (SELECT b.value1 FROM default.src b where b.key = 1)"
     checkAnswer("bob", s, Seq(Row(md5Hex("1"))))
   }
 
@@ -252,12 +287,42 @@ trait DataMaskingTestBase extends AnyFunSuite with 
SparkSessionProvider with Bef
     withCleanTmpResources(Seq(("default.perm_view", "view"))) {
       checkAnswer(
         "perm_view_user",
-        "SELECT value1, value2 FROM default.src where key < 20",
+        "SELECT value1, value2 FROM default.src where key = 1",
         Seq(Row(1, "hello")))
       checkAnswer(
         "perm_view_user",
-        "SELECT value1, value2 FROM default.perm_view where key < 20",
+        "SELECT value1, value2 FROM default.perm_view where key = 1",
         Seq(Row(md5Hex("1"), "hello")))
     }
   }
+
+  // This test only includes a small subset of UCS-2 characters.
+  // But in theory, it should work for all characters
+  test("test MASK,MASK_SHOW_FIRST_4,MASK_SHOW_LAST_4 rule  with non-English 
character set") {
+    val s1 = s"SELECT * FROM default.src where key = 10"
+    val s2 = s"SELECT * FROM default.src where key = 11"
+    // scalastyle:off
+    checkAnswer(
+      "bob",
+      s1,
+      Seq(Row(
+        10,
+        md5Hex("4"),
+        "xxxxxUXXXXUnnnUUUUUUXUUUUUUUUUUUUU",
+        "hellxUXXXXUnnnUUUUUUXUUUUUUUUUUUUU",
+        Timestamp.valueOf("2018-01-01 00:00:00"),
+        "xxxxxUXXXXUnnnUUUUUUXUUUUUUUUUア叶葉엽")))
+    checkAnswer(
+      "bob",
+      s2,
+      Seq(Row(
+        11,
+        md5Hex("5"),
+        "XUUUUUUUUUUUUUUxxxxxUXXXXUnnnUUUUU",
+        "AßþΔUUUUUUUUUUUxxxxxUXXXXUnnnUUUUU",
+        Timestamp.valueOf("2018-01-01 00:00:00"),
+        "XUUUUUUUUUUUUUUxxxxxUXXXXUnnnU~!@#")))
+    // scalastyle:on
+  }
+
 }

Reply via email to