dtenedor commented on code in PR #38146:
URL: https://github.com/apache/spark/pull/38146#discussion_r1008264498


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala:
##########
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage =
+    """_FUNC_(input[, upperChar, lowerChar, digitChar, otherChar]) - masks the 
given string value""",
+  arguments = """
+    Arguments:
+      * input      - string value to mask. Supported types: STRING, VARCHAR, 
CHAR
+      * upperChar  - character to replace upper-case characters with. Specify 
-1 to retain original character. Default value: 'X'
+      * lowerChar  - character to replace lower-case characters with. Specify 
-1 to retain original character. Default value: 'x'
+      * digitChar  - character to replace digit characters with. Specify -1 to 
retain original character. Default value: 'n'
+      * otherChar  - character to replace all other characters with. Specify 
-1 to retain original character. Default value: -1
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('abcd-EFGH-8765-4321');
+        xxxx-XXXX-nnnn-nnnn
+      > SELECT _FUNC_('abcd-EFGH-8765-4321', 'Q');
+        xxxx-QQQQ-nnnn-nnnn
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#');
+        XxXXnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q');
+        QxQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd');
+        QqQQddd-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd', 'o');
+        QqQQdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1, 'q', 'd', 'o');
+        AqCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, 'd', 'o');
+        AbCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, -1, 'o');
+        AbCD123oooo
+      > SELECT _FUNC_(NULL, -1,-1, -1, 'o');
+        NULL
+      > SELECT _FUNC_(NULL);
+        NULL
+      > SELECT _FUNC_('AbCD123-@$#', -1, -1, -1, -1);
+        AbCD123-@$#
+  """,
+  since = "3.4.0",
+  group = "string_funcs")
+// scalastyle:on line.size.limit
+case class Mask(
+    input: Expression,
+    upperChar: Expression,
+    lowerChar: Expression,
+    digitChar: Expression,
+    otherChar: Expression)
+    extends QuinaryExpression
+    with ImplicitCastInputTypes
+    with NullIntolerant {
+
+  def this(input: Expression) =
+    this(
+      input,
+      Literal(Mask.MASKED_UPPERCASE),
+      Literal(Mask.MASKED_LOWERCASE),
+      Literal(Mask.MASKED_DIGIT),
+      Literal(Mask.MASKED_IGNORE))
+
+  def this(input: Expression, upperChar: Expression) =
+    this(
+      input,
+      upperChar,
+      Literal(Mask.MASKED_LOWERCASE),
+      Literal(Mask.MASKED_DIGIT),
+      Literal(Mask.MASKED_IGNORE))
+
+  def this(input: Expression, upperChar: Expression, lowerChar: Expression) =
+    this(input, upperChar, lowerChar, Literal(Mask.MASKED_DIGIT), 
Literal(Mask.MASKED_IGNORE))
+
+  def this(
+      input: Expression,
+      upperChar: Expression,
+      lowerChar: Expression,
+      digitChar: Expression) =
+    this(input, upperChar, lowerChar, digitChar, Literal(Mask.MASKED_IGNORE))
+
+  /**
+   * Expected input types from child expressions. The i-th position in the 
returned seq indicates
+   * the type requirement for the i-th child.
+   *
+   * The possible values at each position are:
+   *   1. a specific data type, e.g. LongType, StringType. 2. a non-leaf 
abstract data type, e.g.
+   *      NumericType, IntegralType, FractionalType.
+   */
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(StringType, StringType, StringType, StringType, StringType)
+
+  /**
+   * Called by default [[eval]] implementation. If subclass of 
QuinaryExpression keep the default
+   * nullability, they can override this method to save null-check code. If we 
need full control
+   * of evaluation process, we should override [[eval]].
+   */
+  override protected def nullSafeEval(
+      input: Any,
+      upperChar: Any,
+      lowerChar: Any,
+      digitChar: Any,
+      otherChar: Any): Any =
+    Mask.transformInput(
+      input.asInstanceOf[UTF8String],
+      upperChar.asInstanceOf[UTF8String],
+      lowerChar.asInstanceOf[UTF8String],
+      digitChar.asInstanceOf[UTF8String],
+      otherChar.asInstanceOf[UTF8String])
+
+  /**
+   * Returns Java source code that can be compiled to evaluate this 
expression. The default
+   * behavior is to call the eval method of the expression. Concrete 
expression implementations
+   * should override this to do actual code generation.
+   *
+   * @param ctx
+   *   a [[CodegenContext]]
+   * @param ev
+   *   an [[ExprCode]] with unique terms.
+   * @return
+   *   an [[ExprCode]] containing the Java source code to generate the given 
expression
+   */
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode =
+    defineCodeGen(
+      ctx,
+      ev,
+      (input, upperChar, lowerChar, digitChar, otherChar) => {
+        s"org.apache.spark.sql.catalyst.expressions.Mask." +
+          s"transformInput($input, $upperChar, $lowerChar, $digitChar, 
$otherChar);"
+      })
+
+  /**
+   * Returns the [[DataType]] of the result of evaluating this expression. It 
is invalid to query
+   * the dataType of an unresolved expression (i.e., when `resolved` == false).
+   */
+  override def dataType: DataType = StringType
+
+  /**
+   * Returns a Seq of the children of this node. Children should not change. 
Immutability required
+   * for containsChild optimization
+   */
+  override def children: Seq[Expression] =
+    Seq(input, upperChar, lowerChar, digitChar, otherChar)
+
+  override protected def withNewChildrenInternal(newChildren: 
IndexedSeq[Expression]): Mask =
+    copy(
+      input = newChildren(0),
+      upperChar = newChildren(1),
+      lowerChar = newChildren(2),
+      digitChar = newChildren(3),
+      otherChar = newChildren(4))
+}
+
+case class MaskArgument(maskChar: Char, ignore: Boolean)
+
+object Mask {
+  private val MASKED_UPPERCASE = 'X'

Review Comment:
   Please add a comment for each of these private vals explaining what the 
character represents.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala:
##########
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage =
+    """_FUNC_(input[, upperChar, lowerChar, digitChar, otherChar]) - masks the 
given string value""",

Review Comment:
   This description probably needs a bit more information. For example, can 
this mention:
   * By default, the function replaces upper-case characters with 'X', 
lower-case characters with 'x', and digits with 'n'.
   * This can be useful for creating copies of tables with sensitive 
information removed, but retaining the same schema.
   * Error behavior: there are no error cases for this expression; it always 
returns a result string for every input string.



##########
sql/core/src/test/resources/sql-tests/inputs/string-functions.sql:
##########
@@ -58,6 +60,54 @@ SELECT substring('Spark SQL' from 5);
 SELECT substring('Spark SQL' from -3);
 SELECT substring('Spark SQL' from 5 for 1);
 
+-- mask function
+SELECT mask('AbCD123-@$#');
+SELECT mask('AbCD123-@$#', 'Q');
+SELECT mask('AbCD123-@$#', 'Q','q');
+SELECT mask('AbCD123-@$#', 'Q','q', 'd');
+SELECT mask('AbCD123-@$#', 'Q','q', 'd', 'o');
+SELECT mask('AbCD123-@$#', -1, 'q', 'd', 'o');
+SELECT mask('AbCD123-@$#', -1,-1, 'd', 'o');
+SELECT mask('AbCD123-@$#', -1,-1, -1, 'o');
+SELECT mask('AbCD123-@$#', -1, -1, -1, -1);
+SELECT mask(NULL);
+SELECT mask(NULL, -1, 'q', 'd', 'o');
+SELECT mask(NULL, -1,-1, 'd', 'o');
+SELECT mask(NULL, -1,-1, -1, 'o');
+SELECT mask(NULL, -1, -1, -1, -1);
+SELECT mask(c1) from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, 'Q') from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, 'Q','q')from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, 'Q','q', 'd') from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, 'Q','q', 'd', 'o') from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, -1, 'q', 'd', 'o') from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, -1,-1, 'd', 'o') from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, -1,-1, -1, 'o') from values ('AbCD123-@$#') as tab(c1);
+SELECT mask(c1, -1, -1, -1, -1) from values ('AbCD123-@$#') as tab(c1);
+SELECT mask('abcd-EFGH-8765-4321');
+SELECT mask('abcd-EFGH-8765-4321', 'Q');
+SELECT mask('abcd-EFGH-8765-4321', 'Q','q');
+SELECT mask('abcd-EFGH-8765-4321', 'Q','q', 'd');
+SELECT mask('abcd-EFGH-8765-4321', 'Q','q', 'd', '*');
+SELECT mask('abcd-EFGH-8765-4321', -1, 'q', 'd', '*');
+SELECT mask('abcd-EFGH-8765-4321', -1,-1, 'd', '*');
+SELECT mask('abcd-EFGH-8765-4321', -1,-1, -1, '*');
+SELECT mask('abcd-EFGH-8765-4321', -1, -1, -1, -1);
+SELECT mask(NULL);
+SELECT mask(NULL, -1, 'q', 'd', '*');
+SELECT mask(NULL, -1,-1, 'd', '*');
+SELECT mask(NULL, -1,-1, -1, '*');
+SELECT mask(NULL, -1, -1, -1, -1);
+SELECT mask(c1) from values ('abcd-EFGH-8765-4321') as tab(c1);
+SELECT mask(c1, 'Q') from values ('abcd-EFGH-8765-4321') as tab(c1);
+SELECT mask(c1, 'Q','q')from values ('abcd-EFGH-8765-4321') as tab(c1);
+SELECT mask(c1, 'Q','q', 'd') from values ('abcd-EFGH-8765-4321') as tab(c1);

Review Comment:
   For these cases, the replacement character arguments are all literal values. 
Can we add test cases where these arguments are column references? They can 
either return results, or return errors if we decide to explicitly ban this 
behavior, e.g. by implementing `checkInputDataTypes` to return an error for 
non-literal arguments.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala:
##########
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage =
+    """_FUNC_(input[, upperChar, lowerChar, digitChar, otherChar]) - masks the 
given string value""",
+  arguments = """
+    Arguments:
+      * input      - string value to mask. Supported types: STRING, VARCHAR, 
CHAR
+      * upperChar  - character to replace upper-case characters with. Specify 
-1 to retain original character. Default value: 'X'
+      * lowerChar  - character to replace lower-case characters with. Specify 
-1 to retain original character. Default value: 'x'
+      * digitChar  - character to replace digit characters with. Specify -1 to 
retain original character. Default value: 'n'
+      * otherChar  - character to replace all other characters with. Specify 
-1 to retain original character. Default value: -1
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('abcd-EFGH-8765-4321');
+        xxxx-XXXX-nnnn-nnnn
+      > SELECT _FUNC_('abcd-EFGH-8765-4321', 'Q');
+        xxxx-QQQQ-nnnn-nnnn
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#');
+        XxXXnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q');
+        QxQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd');
+        QqQQddd-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd', 'o');
+        QqQQdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1, 'q', 'd', 'o');
+        AqCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, 'd', 'o');
+        AbCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, -1, 'o');
+        AbCD123oooo
+      > SELECT _FUNC_(NULL, -1,-1, -1, 'o');

Review Comment:
   Also, please add an example where one of the replacement characters is NULL. 
What happens then?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala:
##########
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage =
+    """_FUNC_(input[, upperChar, lowerChar, digitChar, otherChar]) - masks the 
given string value""",
+  arguments = """
+    Arguments:
+      * input      - string value to mask. Supported types: STRING, VARCHAR, 
CHAR
+      * upperChar  - character to replace upper-case characters with. Specify 
-1 to retain original character. Default value: 'X'
+      * lowerChar  - character to replace lower-case characters with. Specify 
-1 to retain original character. Default value: 'x'
+      * digitChar  - character to replace digit characters with. Specify -1 to 
retain original character. Default value: 'n'
+      * otherChar  - character to replace all other characters with. Specify 
-1 to retain original character. Default value: -1
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('abcd-EFGH-8765-4321');
+        xxxx-XXXX-nnnn-nnnn
+      > SELECT _FUNC_('abcd-EFGH-8765-4321', 'Q');
+        xxxx-QQQQ-nnnn-nnnn
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#');
+        XxXXnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q');
+        QxQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd');
+        QqQQddd-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd', 'o');
+        QqQQdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1, 'q', 'd', 'o');
+        AqCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, 'd', 'o');
+        AbCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, -1, 'o');

Review Comment:
   ```suggestion
         > SELECT _FUNC_('AbCD123-@$#', -1, -1, -1, 'o');
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala:
##########
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage =
+    """_FUNC_(input[, upperChar, lowerChar, digitChar, otherChar]) - masks the 
given string value""",
+  arguments = """
+    Arguments:
+      * input      - string value to mask. Supported types: STRING, VARCHAR, 
CHAR
+      * upperChar  - character to replace upper-case characters with. Specify 
-1 to retain original character. Default value: 'X'
+      * lowerChar  - character to replace lower-case characters with. Specify 
-1 to retain original character. Default value: 'x'
+      * digitChar  - character to replace digit characters with. Specify -1 to 
retain original character. Default value: 'n'
+      * otherChar  - character to replace all other characters with. Specify 
-1 to retain original character. Default value: -1
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('abcd-EFGH-8765-4321');
+        xxxx-XXXX-nnnn-nnnn
+      > SELECT _FUNC_('abcd-EFGH-8765-4321', 'Q');
+        xxxx-QQQQ-nnnn-nnnn
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#');
+        XxXXnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q');
+        QxQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q');
+        QqQQnnn-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd');
+        QqQQddd-@$#
+      > SELECT _FUNC_('AbCD123-@$#', 'Q','q', 'd', 'o');
+        QqQQdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1, 'q', 'd', 'o');
+        AqCDdddoooo
+      > SELECT _FUNC_('AbCD123-@$#', -1,-1, 'd', 'o');

Review Comment:
   ```suggestion
         > SELECT _FUNC_('AbCD123-@$#', -1, -1, 'd', 'o');
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to