This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new fdc11ab0494  [SPARK-40078][PYTHON][DOCS] Make pyspark.sql.column examples self-contained
fdc11ab0494 is described below

commit fdc11ab0494a681444e7a7e13f3f99d25fa6cf2f
Author: Qian.Sun <qian.sun2...@gmail.com>
AuthorDate: Wed Aug 24 08:57:33 2022 +0900

    [SPARK-40078][PYTHON][DOCS] Make pyspark.sql.column examples self-contained

    ### What changes were proposed in this pull request?

    This PR proposes to add Parameters/Returns sections and improve the examples in
    `pyspark.sql.column` by making each example self-contained, with a brief explanation
    and a slightly more realistic example.

    ### Why are the changes needed?

    To make the documentation more readable and easy to copy and paste directly into the
    PySpark shell.

    ### Does this PR introduce _any_ user-facing change?

    Yes, it changes the documentation.

    ### How was this patch tested?

    Manually ran each doctest.

    Closes #37521 from dcoliversun/SPARK-40078.

    Authored-by: Qian.Sun <qian.sun2...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/column.py | 185 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 172 insertions(+), 13 deletions(-)
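For readers of the patch below, "self-contained" means each doctest now builds the
DataFrame it uses, instead of relying on a `df` pre-seeded into the doctest globals
by `_test()` (see the last hunk). The pattern the patch applies, as a minimal sketch
assuming only a local PySpark install; the snippet is illustrative, not part of the patch:

    from pyspark.sql import SparkSession

    # Each example carries its own setup, so it can be pasted straight into
    # a PySpark shell (where `spark` already exists) or a plain Python REPL.
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])

    # Mirrors the kind of doctest the patch adds, e.g. for Column.like:
    df.filter(df.name.like("Al%")).collect()
    # [Row(age=2, name='Alice')]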
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 31954a95690..3746d8eba12 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -35,7 +35,7 @@ from py4j.java_gateway import JavaObject

 from pyspark import copy_func
 from pyspark.context import SparkContext
-from pyspark.sql.types import DataType, StructField, StructType, IntegerType, StringType
+from pyspark.sql.types import DataType

 if TYPE_CHECKING:
     from pyspark.sql._typing import ColumnOrName, LiteralType, DecimalLiteral, DateTimeLiteral
@@ -187,18 +187,28 @@ class Column:
     """
     A column in a DataFrame.

-    :class:`Column` instances can be created by::
+    .. versionadded:: 1.3.0
+
+    Examples
+    --------
+    Column instances can be created by

-        # 1. Select a column out of a DataFrame
+    >>> df = spark.createDataFrame(
+    ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])

-        df.colName
-        df["colName"]
+    Select a column out of a DataFrame

-        # 2. Create from an expression
-        df.colName + 1
-        1 / df.colName
+    >>> df.name
+    Column<'name'>
+    >>> df["name"]
+    Column<'name'>

-    .. versionadded:: 1.3.0
+    Create from an expression
+
+    >>> df.age + 1
+    Column<'(age + 1)'>
+    >>> 1 / df.age
+    Column<'(1 / age)'>
     """

     def __init__(self, jc: JavaObject) -> None:
@@ -405,6 +415,20 @@ class Column:

         .. versionadded:: 1.3.0

+        Parameters
+        ----------
+        key
+            a literal value, or a :class:`Column` expression:
+            the position to look up in a list, or the key to look up in a dict.
+
+            .. deprecated:: 3.0.0
+                :class:`Column` as a parameter is deprecated.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the item at the given position in a list,
+            or the value for the given key in a dict.
+
         Examples
         --------
         >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
@@ -430,6 +454,19 @@ class Column:

         .. versionadded:: 1.3.0

+        Parameters
+        ----------
+        name
+            a literal value, or a :class:`Column` expression:
+            the name of the struct field to extract.
+
+            .. deprecated:: 3.0.0
+                :class:`Column` as a parameter is deprecated.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the struct field retrieved by name.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -462,6 +499,20 @@ class Column:

         .. versionadded:: 3.1.0

+        Parameters
+        ----------
+        fieldName : str
+            a literal value: the name of the struct field to add or replace.
+        col : :class:`Column`
+            A :class:`Column` expression for the new value of `fieldName`.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the struct with the field named `fieldName`
+            added or replaced.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -495,6 +546,17 @@ class Column:

         .. versionadded:: 3.1.0

+        Parameters
+        ----------
+        fieldNames : str
+            Desired field names (collects all positional arguments passed).
+            A field is dropped wherever its name matches one of these.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the struct with the given fields dropped.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -570,6 +632,8 @@ class Column:

         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.contains('o')).collect()
         [Row(age=5, name='Bob')]
         """
@@ -583,6 +647,8 @@ class Column:

         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.startswith('Al')).collect()
         [Row(age=2, name='Alice')]
         >>> df.filter(df.name.startswith('^Al')).collect()
@@ -598,6 +664,8 @@ class Column:

         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.endswith('ice')).collect()
         [Row(age=2, name='Alice')]
         >>> df.filter(df.name.endswith('ice$')).collect()
@@ -621,8 +689,16 @@ class Column:
         --------
         pyspark.sql.Column.rlike

+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element
+            in the Column is matched by the SQL LIKE pattern.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.like('Al%')).collect()
         [Row(age=2, name='Alice')]
         """
@@ -639,8 +715,16 @@ class Column:
         other : str
             an extended regex expression

+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element
+            in the Column is matched by the extended regex expression.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.rlike('ice$')).collect()
         [Row(age=2, name='Alice')]
         """
@@ -663,8 +747,16 @@ class Column:
         --------
         pyspark.sql.Column.rlike

+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element
+            in the Column is matched by the case-insensitive SQL LIKE pattern.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.ilike('%Ice')).collect()
         [Row(age=2, name='Alice')]
         """
@@ -692,8 +784,15 @@ class Column:
         length : :class:`Column` or int
             length of the substring

+        Returns
+        -------
+        :class:`Column`
+            Column representing the substring of each element in the original Column.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.select(df.name.substr(1, 3).alias("col")).collect()
         [Row(col='Ali'), Row(col='Bob')]
         """
@@ -720,8 +819,20 @@ class Column:

         .. versionadded:: 1.5.0

+        Parameters
+        ----------
+        cols
+            the values to compare with; the result is true wherever
+            the Column's value matches one of them.
+
+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element in the Column is contained in cols.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df[df.name.isin("Bob", "Mike")].collect()
         [Row(age=5, name='Bob')]
         >>> df[df.age.isin([1, 2, 3])].collect()
[(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df[df.name.isin("Bob", "Mike")].collect() [Row(age=5, name='Bob')] >>> df[df.age.isin([1, 2, 3])].collect() @@ -870,8 +981,15 @@ class Column: .. versionchanged:: 2.2.0 Added optional ``metadata`` argument. + Returns + ------- + :class:`Column` + Column representing whether each element of Column is aliased with new name or names. + Examples -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.age.alias("age2")).collect() [Row(age2=2), Row(age2=5)] >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max'] @@ -903,8 +1021,22 @@ class Column: .. versionadded:: 1.3.0 + Parameters + ---------- + dataType : :class:`DataType` or str + a DataType or Python string literal with a DDL-formatted string + to use when parsing the column to the same type. + + Returns + ------- + :class:`Column` + Column representing whether each element of Column is cast into new type. + Examples -------- + >>> from pyspark.sql.types import StringType + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.age.cast("string").alias('ages')).collect() [Row(ages='2'), Row(ages='5')] >>> df.select(df.age.cast(StringType()).alias('ages')).collect() @@ -934,8 +1066,23 @@ class Column: .. versionadded:: 1.3.0 + Parameters + ---------- + lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal + a boolean expression that boundary start, inclusive. + upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal + a boolean expression that boundary end, inclusive. + + Returns + ------- + :class:`Column` + Column of booleans showing whether each element of Column + is between left and right (inclusive). + Examples -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.name, df.age.between(2, 4)).show() +-----+---------------------------+ | name|((age >= 2) AND (age <= 4))| @@ -960,9 +1107,16 @@ class Column: value a literal value, or a :class:`Column` expression. + Returns + ------- + :class:`Column` + Column representing whether each element of Column is in conditions. + Examples -------- >>> from pyspark.sql import functions as F + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show() +-----+------------------------------------------------------------+ | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END| @@ -993,9 +1147,16 @@ class Column: value a literal value, or a :class:`Column` expression. + Returns + ------- + :class:`Column` + Column representing whether each element of Column is unmatched conditions. + Examples -------- >>> from pyspark.sql import functions as F + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show() +-----+-------------------------------------+ | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END| @@ -1033,6 +1194,8 @@ class Column: .rowsBetween(Window.unboundedPreceding, Window.currentRow) >>> from pyspark.sql.functions import rank, min >>> from pyspark.sql.functions import desc + >>> df = spark.createDataFrame( + ... 
[(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.withColumn("rank", rank().over(window)) \ .withColumn("min", min('age').over(window)).sort(desc("age")).show() +---+-----+----+---+ @@ -1068,11 +1231,7 @@ def _test() -> None: globs = pyspark.sql.column.__dict__.copy() spark = SparkSession.builder.master("local[4]").appName("sql.column tests").getOrCreate() - sc = spark.sparkContext globs["spark"] = spark - globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF( - StructType([StructField("age", IntegerType()), StructField("name", StringType())]) - ) (failure_count, test_count) = doctest.testmod( pyspark.sql.column, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org