This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new fdc11ab0494  [SPARK-40078][PYTHON][DOCS] Make pyspark.sql.column examples self-contained
fdc11ab0494 is described below

commit fdc11ab0494a681444e7a7e13f3f99d25fa6cf2f
Author: Qian.Sun <qian.sun2...@gmail.com>
AuthorDate: Wed Aug 24 08:57:33 2022 +0900

    [SPARK-40078][PYTHON][DOCS] Make pyspark.sql.column examples self-contained

    ### What changes were proposed in this pull request?

    This PR proposes to add Parameters/Returns sections and improve the examples in
    `pyspark.sql.column` by making each example self-contained, with a brief explanation
    and a slightly more realistic example.

    ### Why are the changes needed?

    To make the documentation more readable and easy to copy and paste directly into the
    PySpark shell.

    ### Does this PR introduce _any_ user-facing change?

    Yes, it changes the documentation.

    ### How was this patch tested?

    Manually ran each doctest.

    Closes #37521 from dcoliversun/SPARK-40078.

    Authored-by: Qian.Sun <qian.sun2...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/column.py | 185 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 172 insertions(+), 13 deletions(-)
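For readers of the patch below, "self-contained" means each doctest now builds the
DataFrame it uses, instead of relying on a `df` pre-seeded into the doctest globals
by `_test()` (see the last hunk). The pattern the patch applies, as a minimal sketch
assuming only a local PySpark install; the snippet is illustrative, not part of the patch:

    from pyspark.sql import SparkSession

    # Each example carries its own setup, so it can be pasted straight into
    # a PySpark shell (where `spark` already exists) or a plain Python REPL.
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])

    # Mirrors the kind of doctest the patch adds, e.g. for Column.like:
    df.filter(df.name.like("Al%")).collect()
    # [Row(age=2, name='Alice')]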
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 31954a95690..3746d8eba12 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -35,7 +35,7 @@ from py4j.java_gateway import JavaObject

 from pyspark import copy_func
 from pyspark.context import SparkContext
-from pyspark.sql.types import DataType, StructField, StructType, IntegerType, StringType
+from pyspark.sql.types import DataType

 if TYPE_CHECKING:
     from pyspark.sql._typing import ColumnOrName, LiteralType, DecimalLiteral, DateTimeLiteral
@@ -187,18 +187,28 @@ class Column:
     """
     A column in a DataFrame.

-    :class:`Column` instances can be created by::
+    .. versionadded:: 1.3.0
+
+    Examples
+    --------
+    Column instances can be created by

-        # 1. Select a column out of a DataFrame
+    >>> df = spark.createDataFrame(
+    ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])

-        df.colName
-        df["colName"]
+    Select a column out of a DataFrame

-        # 2. Create from an expression
-        df.colName + 1
-        1 / df.colName
+    >>> df.name
+    Column<'name'>
+    >>> df["name"]
+    Column<'name'>

-    .. versionadded:: 1.3.0
+    Create from an expression
+
+    >>> df.age + 1
+    Column<'(age + 1)'>
+    >>> 1 / df.age
+    Column<'(1 / age)'>
     """

     def __init__(self, jc: JavaObject) -> None:
@@ -405,6 +415,20 @@ class Column:

         .. versionadded:: 1.3.0

+        Parameters
+        ----------
+        key
+            a literal value, or a :class:`Column` expression:
+            the position to look up in a list, or the key to look up in a dict.
+
+            .. deprecated:: 3.0.0
+                :class:`Column` as a parameter is deprecated.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the item at the given position in a list,
+            or the value for the given key in a dict.
+
         Examples
         --------
         >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
@@ -430,6 +454,19 @@ class Column:

         .. versionadded:: 1.3.0

+        Parameters
+        ----------
+        name
+            a literal value, or a :class:`Column` expression:
+            the name of the struct field to extract.
+
+            .. deprecated:: 3.0.0
+                :class:`Column` as a parameter is deprecated.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the struct field retrieved by name.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -462,6 +499,20 @@ class Column:

         .. versionadded:: 3.1.0

+        Parameters
+        ----------
+        fieldName : str
+            a literal value: the name of the struct field to add or replace.
+        col : :class:`Column`
+            A :class:`Column` expression for the new value of `fieldName`.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the struct with the field named `fieldName`
+            added or replaced.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -495,6 +546,17 @@ class Column:

         .. versionadded:: 3.1.0

+        Parameters
+        ----------
+        fieldNames : str
+            Desired field names (collects all positional arguments passed).
+            A field is dropped wherever its name matches one of these.
+
+        Returns
+        -------
+        :class:`Column`
+            Column representing the struct with the given fields dropped.
+
         Examples
         --------
         >>> from pyspark.sql import Row
@@ -570,6 +632,8 @@ class Column:

         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.contains('o')).collect()
         [Row(age=5, name='Bob')]
         """
@@ -583,6 +647,8 @@ class Column:

         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.startswith('Al')).collect()
         [Row(age=2, name='Alice')]
         >>> df.filter(df.name.startswith('^Al')).collect()
@@ -598,6 +664,8 @@ class Column:

         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.endswith('ice')).collect()
         [Row(age=2, name='Alice')]
         >>> df.filter(df.name.endswith('ice$')).collect()
@@ -621,8 +689,16 @@ class Column:
         --------
         pyspark.sql.Column.rlike

+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element
+            in the Column is matched by the SQL LIKE pattern.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.like('Al%')).collect()
         [Row(age=2, name='Alice')]
         """
@@ -639,8 +715,16 @@ class Column:
         other : str
             an extended regex expression

+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element
+            in the Column is matched by the extended regex expression.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.rlike('ice$')).collect()
         [Row(age=2, name='Alice')]
         """
@@ -663,8 +747,16 @@ class Column:
         --------
         pyspark.sql.Column.rlike

+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element
+            in the Column is matched by the case-insensitive SQL LIKE pattern.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.filter(df.name.ilike('%Ice')).collect()
         [Row(age=2, name='Alice')]
         """
@@ -692,8 +784,15 @@ class Column:
         length : :class:`Column` or int
             length of the substring

+        Returns
+        -------
+        :class:`Column`
+            Column representing the substring of each element in the original Column.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df.select(df.name.substr(1, 3).alias("col")).collect()
         [Row(col='Ali'), Row(col='Bob')]
         """
@@ -720,8 +819,20 @@ class Column:

         .. versionadded:: 1.5.0

+        Parameters
+        ----------
+        cols
+            the values to compare with; the result is true wherever
+            the Column's value matches one of them.
+
+        Returns
+        -------
+        :class:`Column`
+            Column of booleans showing whether each element in the Column is contained in cols.
+
         Examples
         --------
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
         >>> df[df.name.isin("Bob", "Mike")].collect()
         [Row(age=5, name='Bob')]
         >>> df[df.age.isin([1, 2, 3])].collect()
[(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df[df.name.isin("Bob", "Mike")].collect() [Row(age=5, name='Bob')] >>> df[df.age.isin([1, 2, 3])].collect() @@ -870,8 +981,15 @@ class Column: .. versionchanged:: 2.2.0 Added optional ``metadata`` argument. + Returns + ------- + :class:`Column` + Column representing whether each element of Column is aliased with new name or names. + Examples -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.age.alias("age2")).collect() [Row(age2=2), Row(age2=5)] >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max'] @@ -903,8 +1021,22 @@ class Column: .. versionadded:: 1.3.0 + Parameters + ---------- + dataType : :class:`DataType` or str + a DataType or Python string literal with a DDL-formatted string + to use when parsing the column to the same type. + + Returns + ------- + :class:`Column` + Column representing whether each element of Column is cast into new type. + Examples -------- + >>> from pyspark.sql.types import StringType + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.age.cast("string").alias('ages')).collect() [Row(ages='2'), Row(ages='5')] >>> df.select(df.age.cast(StringType()).alias('ages')).collect() @@ -934,8 +1066,23 @@ class Column: .. versionadded:: 1.3.0 + Parameters + ---------- + lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal + a boolean expression that boundary start, inclusive. + upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal + a boolean expression that boundary end, inclusive. + + Returns + ------- + :class:`Column` + Column of booleans showing whether each element of Column + is between left and right (inclusive). + Examples -------- + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.name, df.age.between(2, 4)).show() +-----+---------------------------+ | name|((age >= 2) AND (age <= 4))| @@ -960,9 +1107,16 @@ class Column: value a literal value, or a :class:`Column` expression. + Returns + ------- + :class:`Column` + Column representing whether each element of Column is in conditions. + Examples -------- >>> from pyspark.sql import functions as F + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show() +-----+------------------------------------------------------------+ | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END| @@ -993,9 +1147,16 @@ class Column: value a literal value, or a :class:`Column` expression. + Returns + ------- + :class:`Column` + Column representing whether each element of Column is unmatched conditions. + Examples -------- >>> from pyspark.sql import functions as F + >>> df = spark.createDataFrame( + ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show() +-----+-------------------------------------+ | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END| @@ -1033,6 +1194,8 @@ class Column: .rowsBetween(Window.unboundedPreceding, Window.currentRow) >>> from pyspark.sql.functions import rank, min >>> from pyspark.sql.functions import desc + >>> df = spark.createDataFrame( + ... 
[(2, "Alice"), (5, "Bob")], ["age", "name"]) >>> df.withColumn("rank", rank().over(window)) \ .withColumn("min", min('age').over(window)).sort(desc("age")).show() +---+-----+----+---+ @@ -1068,11 +1231,7 @@ def _test() -> None: globs = pyspark.sql.column.__dict__.copy() spark = SparkSession.builder.master("local[4]").appName("sql.column tests").getOrCreate() - sc = spark.sparkContext globs["spark"] = spark - globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF( - StructType([StructField("age", IntegerType()), StructField("name", StringType())]) - ) (failure_count, test_count) = doctest.testmod( pyspark.sql.column, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org