This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 920df93c41ed [SPARK-48877][PYTHON][DOCS] Test the default column name
of array functions
920df93c41ed is described below
commit 920df93c41edb76adbc9e0148c7fd2dc44a17b03
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Fri Jul 12 17:11:41 2024 +0900
[SPARK-48877][PYTHON][DOCS] Test the default column name of array functions
### What changes were proposed in this pull request?
Test the default column name of array functions
### Why are the changes needed?
For test coverage: sometimes the default column name is a problem.
### Does this PR introduce _any_ user-facing change?
doc changes
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #47318 from zhengruifeng/py_avoid_alias_array_func.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 201 ++++++++++++++++----------------
1 file changed, 98 insertions(+), 103 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 446ff2b1be93..0b464aa20710 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -13443,39 +13443,39 @@ def array(
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("Alice", "doctor"), ("Bob", "engineer")],
... ("name", "occupation"))
- >>> df.select(sf.array('name', 'occupation').alias("arr")).show()
- +---------------+
- | arr|
- +---------------+
- |[Alice, doctor]|
- |[Bob, engineer]|
- +---------------+
+ >>> df.select(sf.array('name', 'occupation')).show()
+ +-----------------------+
+ |array(name, occupation)|
+ +-----------------------+
+ | [Alice, doctor]|
+ | [Bob, engineer]|
+ +-----------------------+
Example 2: Usage of array function with Column objects.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("Alice", "doctor"), ("Bob", "engineer")],
... ("name", "occupation"))
- >>> df.select(sf.array(df.name, df.occupation).alias("arr")).show()
- +---------------+
- | arr|
- +---------------+
- |[Alice, doctor]|
- |[Bob, engineer]|
- +---------------+
+ >>> df.select(sf.array(df.name, df.occupation)).show()
+ +-----------------------+
+ |array(name, occupation)|
+ +-----------------------+
+ | [Alice, doctor]|
+ | [Bob, engineer]|
+ +-----------------------+
Example 3: Single argument as list of column names.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("Alice", "doctor"), ("Bob", "engineer")],
... ("name", "occupation"))
- >>> df.select(sf.array(['name', 'occupation']).alias("arr")).show()
- +---------------+
- | arr|
- +---------------+
- |[Alice, doctor]|
- |[Bob, engineer]|
- +---------------+
+ >>> df.select(sf.array(['name', 'occupation'])).show()
+ +-----------------------+
+ |array(name, occupation)|
+ +-----------------------+
+ | [Alice, doctor]|
+ | [Bob, engineer]|
+ +-----------------------+
Example 4: Usage of array function with columns of different types.
@@ -13483,26 +13483,26 @@ def array(
>>> df = spark.createDataFrame(
... [("Alice", 2, 22.2), ("Bob", 5, 36.1)],
... ("name", "age", "weight"))
- >>> df.select(sf.array(['age', 'weight']).alias("arr")).show()
- +-----------+
- | arr|
- +-----------+
- |[2.0, 22.2]|
- |[5.0, 36.1]|
- +-----------+
+ >>> df.select(sf.array(['age', 'weight'])).show()
+ +------------------+
+ |array(age, weight)|
+ +------------------+
+ | [2.0, 22.2]|
+ | [5.0, 36.1]|
+ +------------------+
Example 5: array function with a column containing null values.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("Alice", None), ("Bob", "engineer")],
... ("name", "occupation"))
- >>> df.select(sf.array('name', 'occupation').alias("arr")).show()
- +---------------+
- | arr|
- +---------------+
- | [Alice, NULL]|
- |[Bob, engineer]|
- +---------------+
+ >>> df.select(sf.array('name', 'occupation')).show()
+ +-----------------------+
+ |array(name, occupation)|
+ +-----------------------+
+ | [Alice, NULL]|
+ | [Bob, engineer]|
+ +-----------------------+
"""
if len(cols) == 1 and isinstance(cols[0], (list, set)):
cols = cols[0] # type: ignore[assignment]
@@ -13540,13 +13540,13 @@ def array_contains(col: "ColumnOrName", value: Any)
-> Column:
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
- >>> df.select(sf.array_contains(df.data, "a").alias("contains_a")).show()
- +----------+
- |contains_a|
- +----------+
- | true|
- | false|
- +----------+
+ >>> df.select(sf.array_contains(df.data, "a")).show()
+ +-----------------------+
+ |array_contains(data, a)|
+ +-----------------------+
+ | true|
+ | false|
+ +-----------------------+
Example 2: Usage of array_contains function with a column.
@@ -13554,38 +13554,37 @@ def array_contains(col: "ColumnOrName", value: Any)
-> Column:
>>> df = spark.createDataFrame([(["a", "b", "c"], "c"),
... (["c", "d", "e"], "d"),
... (["e", "a", "c"], "b")], ["data", "item"])
- >>> df.select(sf.array_contains(df.data, sf.col("item"))
- ... .alias("data_contains_item")).show()
- +------------------+
- |data_contains_item|
- +------------------+
- | true|
- | true|
- | false|
- +------------------+
+ >>> df.select(sf.array_contains(df.data, sf.col("item"))).show()
+ +--------------------------+
+ |array_contains(data, item)|
+ +--------------------------+
+ | true|
+ | true|
+ | false|
+ +--------------------------+
Example 3: Attempt to use array_contains function with a null array.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(None,), (["a", "b", "c"],)], ['data'])
- >>> df.select(sf.array_contains(df.data, "a").alias("contains_a")).show()
- +----------+
- |contains_a|
- +----------+
- | NULL|
- | true|
- +----------+
+ >>> df.select(sf.array_contains(df.data, "a")).show()
+ +-----------------------+
+ |array_contains(data, a)|
+ +-----------------------+
+ | NULL|
+ | true|
+ +-----------------------+
Example 4: Usage of array_contains with an array column containing null
values.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(["a", None, "c"],)], ['data'])
- >>> df.select(sf.array_contains(df.data, "a").alias("contains_a")).show()
- +----------+
- |contains_a|
- +----------+
- | true|
- +----------+
+ >>> df.select(sf.array_contains(df.data, "a")).show()
+ +-----------------------+
+ |array_contains(data, a)|
+ +-----------------------+
+ | true|
+ +-----------------------+
"""
return _invoke_function_over_columns("array_contains", col, lit(value))
@@ -13620,49 +13619,49 @@ def arrays_overlap(a1: "ColumnOrName", a2:
"ColumnOrName") -> Column:
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b",
"c"])], ['x', 'y'])
- >>> df.select(sf.arrays_overlap(df.x, df.y).alias("overlap")).show()
- +-------+
- |overlap|
- +-------+
- | true|
- | false|
- +-------+
+ >>> df.select(sf.arrays_overlap(df.x, df.y)).show()
+ +--------------------+
+ |arrays_overlap(x, y)|
+ +--------------------+
+ | true|
+ | false|
+ +--------------------+
Example 2: Usage of arrays_overlap function with arrays containing null
elements.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(["a", None], ["b", None]), (["a"], ["b",
"c"])], ['x', 'y'])
- >>> df.select(sf.arrays_overlap(df.x, df.y).alias("overlap")).show()
- +-------+
- |overlap|
- +-------+
- | NULL|
- | false|
- +-------+
+ >>> df.select(sf.arrays_overlap(df.x, df.y)).show()
+ +--------------------+
+ |arrays_overlap(x, y)|
+ +--------------------+
+ | NULL|
+ | false|
+ +--------------------+
Example 3: Usage of arrays_overlap function with arrays that are null.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(None, ["b", "c"]), (["a"], None)], ['x',
'y'])
- >>> df.select(sf.arrays_overlap(df.x, df.y).alias("overlap")).show()
- +-------+
- |overlap|
- +-------+
- | NULL|
- | NULL|
- +-------+
+ >>> df.select(sf.arrays_overlap(df.x, df.y)).show()
+ +--------------------+
+ |arrays_overlap(x, y)|
+ +--------------------+
+ | NULL|
+ | NULL|
+ +--------------------+
Example 4: Usage of arrays_overlap on arrays with identical elements.
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(["a", "b"], ["a", "b"]), (["a"], ["a"])],
['x', 'y'])
- >>> df.select(sf.arrays_overlap(df.x, df.y).alias("overlap")).show()
- +-------+
- |overlap|
- +-------+
- | true|
- | true|
- +-------+
+ >>> df.select(sf.arrays_overlap(df.x, df.y)).show()
+ +--------------------+
+ |arrays_overlap(x, y)|
+ +--------------------+
+ | true|
+ | true|
+ +--------------------+
"""
return _invoke_function_over_columns("arrays_overlap", a1, a2)
@@ -14669,23 +14668,19 @@ def array_insert(arr: "ColumnOrName", pos:
Union["ColumnOrName", int], value: An
Example 4: Inserting a NULL value
>>> from pyspark.sql import functions as sf
- >>> from pyspark.sql.types import StringType
>>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data'])
- >>> df.select(sf.array_insert(df.data, 2, sf.lit(None).cast(StringType()))
- ... .alias("result")).show()
- +---------------+
- | result|
- +---------------+
- |[a, NULL, b, c]|
- +---------------+
+ >>> df.select(sf.array_insert(df.data, 2, sf.lit(None))).show()
+ +---------------------------+
+ |array_insert(data, 2, NULL)|
+ +---------------------------+
+ | [a, NULL, b, c]|
+ +---------------------------+
Example 5: Inserting a value into a NULL array
>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType,
StructField
- >>> schema = StructType([
- ... StructField("data", ArrayType(IntegerType()), True)
- ... ])
+ >>> schema = StructType([StructField("data", ArrayType(IntegerType()),
True)])
>>> df = spark.createDataFrame([(None,)], schema=schema)
>>> df.select(sf.array_insert(df.data, 1, 5)).show()
+------------------------+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]