zhengruifeng commented on code in PR #46901:
URL: https://github.com/apache/spark/pull/46901#discussion_r1635625789
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -10949,9 +10954,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(substring(df.s, 1, 2).alias('s')).collect()
[Row(s='ab')]
- """
+ >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l'])
+ >>> df.select(substring(df.s, 2, df.l).alias('s')).collect()
+ [Row(s='par')]
+ >>> df.select(substring(df.s, df.p, 3).alias('s')).collect()
+ [Row(s='par')]
+ >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect()
+ [Row(s='par')]
+ """
+ # the native str type is shadowed by the function's `str` param
+ from builtins import str as StrType
from pyspark.sql.classic.column import _to_java_column
+ pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos
Review Comment:
```suggestion
pos = pos if isinstance(pos, int) else _to_java_column(pos)
```
then we don't need the `StrType` import
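For context, the issue is that inside `substring` the parameters `str` and `len` shadow the Python builtins, which is what forced the re-import. A minimal, self-contained sketch of the shadowing and the suggested inversion (names here are illustrative, not the actual PR code):
```python
from builtins import str as StrType  # the workaround used in the PR


def substring(str, pos, len):  # `str` and `len` shadow the builtins
    # `isinstance(pos, str)` would test against the *parameter* `str`
    # (a value, not a type) and raise TypeError, hence `StrType` above.
    convert_pr = isinstance(pos, StrType)

    # The suggestion inverts the check: test for the one type that must
    # stay a plain literal, so the shadowed builtin is never referenced.
    convert_suggested = not isinstance(pos, int)

    return convert_pr, convert_suggested


print(substring("abcd", "p", 2))  # (True, True): pos is a column name
print(substring("abcd", 1, 2))    # (False, False): pos is a literal
```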
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -14034,9 +14053,22 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
+-----------------------+
| 3|
+-----------------------+
+
+ Example 6: Finding the position of a column's value in an array of integers
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([([10, 20, 30], 20)], ['data', 'col'])
+ >>> df.select(sf.array_position(df.data, df.col)).show()
+ +-------------------------+
+ |array_position(data, col)|
+ +-------------------------+
+ | 2|
+ +-------------------------+
+
"""
from pyspark.sql.classic.column import _to_java_column
+ value = value._jc if isinstance(value, Column) else value
Review Comment:
doesn't `_to_java_column` already support `Column` type input?
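For reference, the dispatch inside `_to_java_column` has roughly the following shape (paraphrased, not the verbatim Spark source):
```python
# Paraphrased shape of the helper in pyspark.sql.classic.column:
def _to_java_column(col):
    from pyspark.sql.classic.column import Column, _create_column_from_name

    if isinstance(col, Column):
        return col._jc                        # Column inputs are unwrapped here
    if isinstance(col, str):
        return _create_column_from_name(col)  # column names are resolved
    raise TypeError(f"Invalid column: {col!r}")  # anything else (e.g. literals) is rejected
```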
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -17267,9 +17317,21 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
+--------------------------+
| false|
+--------------------------+
+
+ Example 3: Check for key using a column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key")
+ >>> df.select(sf.map_contains_key("data", sf.col("key"))).show()
+ +---------------------------+
+ |map_contains_key(data, key)|
+ +---------------------------+
+ | true|
+ +---------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
+ value = value._jc if isinstance(value, Column) else value
Review Comment:
ditto
##########
connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala:
##########
@@ -4276,6 +4276,45 @@ object functions {
def substring(str: Column, pos: Int, len: Int): Column =
Column.fn("substring", str, lit(pos), lit(len))
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+ * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+ *
+ * @note
+ * The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Column, len: Int): Column =
+ Column.fn("substring", str, pos, lit(len))
+
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+ * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+ *
+ * @note
+ * The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Int, len: Column): Column =
+ Column.fn("substring", str, lit(pos), len)
Review Comment:
```suggestion
```
let's only add the signature `def substring(str: Column, pos: Column, len: Column): Column`,
referring to
https://github.com/apache/spark/blob/a7da9b6b8aed99c5df23ccd83ab21a4a9a50d28a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala#L55-L57
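The convention being invoked here, presumably, is that callers can lift literals with `lit`, so a single all-`Column` overload covers every mixed case. Illustrated on the PySpark side (assumes an active `spark` session as in the doctests above, and the Column support this PR adds):
```python
from pyspark.sql import functions as sf

df = spark.createDataFrame([("Spark", 2, 3)], ["s", "p", "l"])
# One Column-only signature is enough; literals are wrapped with lit():
df.select(sf.substring(df.s, sf.lit(2), df.l)).show()  # literal pos, column len
df.select(sf.substring(df.s, df.p, sf.lit(3))).show()  # column pos, literal len
```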
##########
sql/core/src/main/scala/org/apache/spark/sql/functions.scala:
##########
@@ -4234,6 +4234,45 @@ object functions {
def substring(str: Column, pos: Int, len: Int): Column =
Column.fn("substring", str, lit(pos), lit(len))
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or
+ * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+ * when str is Binary type
+ *
+ * @note The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Column, len: Int): Column =
+ Column.fn("substring", str, pos, lit(len))
+
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or
+ * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+ * when str is Binary type
+ *
+ * @note The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Int, len: Column): Column =
+ Column.fn("substring", str, lit(pos), len)
+
Review Comment:
```suggestion
```
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -17267,9 +17317,21 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
+--------------------------+
| false|
+--------------------------+
+
+ Example 3: Check for key using a column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key")
+ >>> df.select(sf.map_contains_key("data", sf.col("key"))).show()
+ +---------------------------+
+ |map_contains_key(data, key)|
+ +---------------------------+
+ | true|
+ +---------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
+ value = value._jc if isinstance(value, Column) else value
Review Comment:
The case in `Example 3: Check for key using a column` was already supported before this change
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -14470,9 +14505,21 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
+---------------------+
| []|
+---------------------+
+
+ Example 6: Removing a column's value from a simple array
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([([1, 2, 3, 1, 1], 1)], ['data', 'col'])
+ >>> df.select(sf.array_remove(df.data, df.col)).show()
+ +-----------------------+
+ |array_remove(data, col)|
+ +-----------------------+
+ | [2, 3]|
+ +-----------------------+
"""
from pyspark.sql.classic.column import _to_java_column
+ element = element._jc if isinstance(element, Column) else element
Review Comment:
ditto
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]