zhengruifeng commented on code in PR #46901:
URL: https://github.com/apache/spark/pull/46901#discussion_r1635625789
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -10949,9 +10954,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(substring(df.s, 1, 2).alias('s')).collect()
[Row(s='ab')]
- """
+ >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l'])
+ >>> df.select(substring(df.s, 2, df.l).alias('s')).collect()
+ [Row(s='par')]
+ >>> df.select(substring(df.s, df.p, 3).alias('s')).collect()
+ [Row(s='par')]
+ >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect()
+ [Row(s='par')]
+ """
+ # the native str type is shadowed by the function's `str` param
+ from builtins import str as StrType
from pyspark.sql.classic.column import _to_java_column
+ pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos
Review Comment:
```suggestion
pos = pos if isinstance(pos, int) else _to_java_column(pos)
```
then we don't need the `StrType` import
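For context, the issue is that inside `substring` the parameters `str` and `len` shadow the Python builtins, which is what forced the re-import. A minimal, self-contained sketch of the shadowing and the suggested inversion (names here are illustrative, not the actual PR code):
```python
from builtins import str as StrType  # the workaround used in the PR


def substring(str, pos, len):  # `str` and `len` shadow the builtins
    # `isinstance(pos, str)` would test against the *parameter* `str`
    # (a value, not a type) and raise TypeError, hence `StrType` above.
    convert_pr = isinstance(pos, StrType)

    # The suggestion inverts the check: test for the one type that must
    # stay a plain literal, so the shadowed builtin is never referenced.
    convert_suggested = not isinstance(pos, int)

    return convert_pr, convert_suggested


print(substring("abcd", "p", 2))  # (True, True): pos is a column name
print(substring("abcd", 1, 2))    # (False, False): pos is a literal
```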
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -14034,9 +14053,22 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
+-----------------------+
| 3|
+-----------------------+
+
+ Example 6: Finding the position of a column's value in an array of integers
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([([10, 20, 30], 20)], ['data', 'col'])
+ >>> df.select(sf.array_position(df.data, df.col)).show()
+ +-------------------------+
+ |array_position(data, col)|
+ +-------------------------+
+ | 2|
+ +-------------------------+
+
"""
from pyspark.sql.classic.column import _to_java_column
+ value = value._jc if isinstance(value, Column) else value
Review Comment:
doesn't `_to_java_column` already support `Column` type input?
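For reference, the dispatch inside `_to_java_column` has roughly the following shape (paraphrased, not the verbatim Spark source):
```python
# Paraphrased shape of the helper in pyspark.sql.classic.column:
def _to_java_column(col):
    from pyspark.sql.classic.column import Column, _create_column_from_name

    if isinstance(col, Column):
        return col._jc                        # Column inputs are unwrapped here
    if isinstance(col, str):
        return _create_column_from_name(col)  # column names are resolved
    raise TypeError(f"Invalid column: {col!r}")  # anything else (e.g. literals) is rejected
```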
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -17267,9 +17317,21 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
+--------------------------+
| false|
+--------------------------+
+
+ Example 3: Check for key using a column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key")
+ >>> df.select(sf.map_contains_key("data", sf.col("key"))).show()
+ +---------------------------+
+ |map_contains_key(data, key)|
+ +---------------------------+
+ | true|
+ +---------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
+ value = value._jc if isinstance(value, Column) else value
Review Comment:
ditto
##########
connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala:
##########
@@ -4276,6 +4276,45 @@ object functions {
def substring(str: Column, pos: Int, len: Int): Column =
Column.fn("substring", str, lit(pos), lit(len))
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+ * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+ *
+ * @note
+ * The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Column, len: Int): Column =
+ Column.fn("substring", str, pos, lit(len))
+
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+ * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+ *
+ * @note
+ * The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Int, len: Column): Column =
+ Column.fn("substring", str, lit(pos), len)
Review Comment:
```suggestion
```
let's only add the signature `def substring(str: Column, pos: Column, len: Column): Column`,
referring to
https://github.com/apache/spark/blob/a7da9b6b8aed99c5df23ccd83ab21a4a9a50d28a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala#L55-L57
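The convention being invoked here, presumably, is that callers can lift literals with `lit`, so a single all-`Column` overload covers every mixed case. Illustrated on the PySpark side (assumes an active `spark` session as in the doctests above, and the Column support this PR adds):
```python
from pyspark.sql import functions as sf

df = spark.createDataFrame([("Spark", 2, 3)], ["s", "p", "l"])
# One Column-only signature is enough; literals are wrapped with lit():
df.select(sf.substring(df.s, sf.lit(2), df.l)).show()  # literal pos, column len
df.select(sf.substring(df.s, df.p, sf.lit(3))).show()  # column pos, literal len
```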
##########
sql/core/src/main/scala/org/apache/spark/sql/functions.scala:
##########
@@ -4234,6 +4234,45 @@ object functions {
def substring(str: Column, pos: Int, len: Int): Column =
Column.fn("substring", str, lit(pos), lit(len))
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or
+ * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+ * when str is Binary type
+ *
+ * @note The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Column, len: Int): Column =
+ Column.fn("substring", str, pos, lit(len))
+
+ /**
+ * Substring starts at `pos` and is of length `len` when str is String type or
+ * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+ * when str is Binary type
+ *
+ * @note The position is not zero based, but 1 based index.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def substring(str: Column, pos: Int, len: Column): Column =
+ Column.fn("substring", str, lit(pos), len)
+
Review Comment:
```suggestion
```
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -17267,9 +17317,21 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
+--------------------------+
| false|
+--------------------------+
+
+ Example 3: Check for key using a column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key")
+ >>> df.select(sf.map_contains_key("data", sf.col("key"))).show()
+ +---------------------------+
+ |map_contains_key(data, key)|
+ +---------------------------+
+ | true|
+ +---------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
+ value = value._jc if isinstance(value, Column) else value
Review Comment:
The case in `Example 3: Check for key using a column` was already supported before this change
##########
python/pyspark/sql/functions/builtin.py:
##########
@@ -14470,9 +14505,21 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
+---------------------+
| []|
+---------------------+
+
+ Example 6: Removing a column's value from a simple array
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([([1, 2, 3, 1, 1], 1)], ['data', 'col'])
+ >>> df.select(sf.array_remove(df.data, df.col)).show()
+ +-----------------------+
+ |array_remove(data, col)|
+ +-----------------------+
+ | [2, 3]|
+ +-----------------------+
"""
from pyspark.sql.classic.column import _to_java_column
+ element = element._jc if isinstance(element, Column) else element
Review Comment:
ditto
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]