This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new a30d93b646c2 [SPARK-53597][PYTHON][SQL][DOCS] Add `asTable()` and `TableArg` examples
a30d93b646c2 is described below
commit a30d93b646c27fc419ffaba6458aca44c72f741a
Author: Yicong-Huang <[email protected]>
AuthorDate: Wed Nov 26 15:28:19 2025 -0800
[SPARK-53597][PYTHON][SQL][DOCS] Add `asTable()` and `TableArg` examples
### Why are the changes needed?
There are no examples showing how to use `DataFrame.asTable()` and
`TableArg` features. These examples are essential for users to understand how
to:
- Convert DataFrames to table arguments for use in UDTFs
- Control data partitioning and ordering when passing DataFrames as table
arguments
- Use the various methods available on `TableArg` instances
### Does this PR introduce _any_ user-facing change?
Yes, this is a documentation-only change that adds examples to the public
API documentation. No functional changes are introduced.
### How was this patch tested?
The examples follow the same patterns used in existing test cases
(`test_udtf.py`). The examples are written in doctest format and should be
validated when running doctests.
### Was this patch authored or co-authored using generative AI tooling?
Co-Generated-by: Cursor
Closes #53240 from Yicong-Huang/SPARK-53597/docs/add-astable-examples.
Authored-by: Yicong-Huang <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit a3e55e5dc8ddef5f78dc947b274b407bc804c208)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
python/pyspark/sql/dataframe.py | 58 ++++++++++++++
python/pyspark/sql/table_arg.py | 166 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 224 insertions(+)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index ca33539df960..502883cf59b1 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -6691,6 +6691,64 @@ class DataFrame:
-------
:class:`table_arg.TableArg`
A `TableArg` object representing a table argument.
+
+ Examples
+ --------
+ >>> from pyspark.sql.functions import udtf
+ >>>
+ >>> # Create a simple UDTF that processes table data
+ >>> @udtf(returnType="id: int, doubled: int")
+ ... class DoubleUDTF:
+ ... def eval(self, row):
+ ... yield row["id"], row["id"] * 2
+ ...
+ >>> # Create a DataFrame
+ >>> df = spark.createDataFrame([(1,), (2,), (3,)], ["id"])
+ >>>
+ >>> # Use asTable() to pass the DataFrame as a table argument to the UDTF
+ >>> result = DoubleUDTF(df.asTable())
+ >>> result.show()
+ +---+-------+
+ | id|doubled|
+ +---+-------+
+ | 1| 2|
+ | 2| 4|
+ | 3| 6|
+ +---+-------+
+ >>>
+ >>> # Use partitionBy and orderBy to control data partitioning and ordering
+ >>> df2 = spark.createDataFrame(
+ ... [(1, "a"), (1, "b"), (2, "c"), (2, "d")], ["key", "value"]
+ ... )
+ >>>
+ >>> @udtf(returnType="key: int, value: string")
+ ... class ProcessUDTF:
+ ... def eval(self, row):
+ ... yield row["key"], row["value"]
+ ...
+ >>> # Partition by 'key' and order by 'value' within each partition
+ >>> result2 = ProcessUDTF(df2.asTable().partitionBy("key").orderBy("value"))
+ >>> result2.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 1| b|
+ | 2| c|
+ | 2| d|
+ +---+-----+
+ >>>
+ >>> # Use withSinglePartition to process all data in a single partition
+ >>> result3 = ProcessUDTF(df2.asTable().withSinglePartition().orderBy("value"))
+ >>> result3.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 1| b|
+ | 2| c|
+ | 2| d|
+ +---+-----+
"""
...
diff --git a/python/pyspark/sql/table_arg.py b/python/pyspark/sql/table_arg.py
index f96b40b2dee1..483b26eb97ab 100644
--- a/python/pyspark/sql/table_arg.py
+++ b/python/pyspark/sql/table_arg.py
@@ -40,6 +40,59 @@ class TableArg(TableValuedFunctionArgument):
def partitionBy(self, *cols: "ColumnOrName") -> "TableArg":
"""
Partitions the data based on the specified columns.
+
+ This method partitions the table argument data by the specified columns.
+ It must be called before `orderBy()` and cannot be called after
+ `withSinglePartition()` has been called.
+
+ Parameters
+ ----------
+ cols : str, :class:`Column`, or list
+ Column names or :class:`Column` objects to partition by.
+
+ Returns
+ -------
+ :class:`TableArg`
+ A new `TableArg` instance with partitioning applied.
+
+ Examples
+ --------
+ >>> from pyspark.sql.functions import udtf
+ >>>
+ >>> @udtf(returnType="key: int, value: string")
+ ... class ProcessUDTF:
+ ... def eval(self, row):
+ ... yield row["key"], row["value"]
+ ...
+ >>> df = spark.createDataFrame(
+ ... [(1, "a"), (1, "b"), (2, "c"), (2, "d")], ["key", "value"]
+ ... )
+ >>>
+ >>> # Partition by a single column
+ >>> result = ProcessUDTF(df.asTable().partitionBy("key"))
+ >>> result.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 1| b|
+ | 2| c|
+ | 2| d|
+ +---+-----+
+ >>>
+ >>> # Partition by multiple columns
+ >>> df2 = spark.createDataFrame(
+ ... [(1, "x", "v1"), (1, "x", "v2"), (2, "y", "v3")], ["key", "category", "value"]
+ ... )
+ >>> result2 = ProcessUDTF(df2.asTable().partitionBy("key", "category"))
+ >>> result2.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| v1|
+ | 1| v2|
+ | 2| v3|
+ +---+-----+
"""
...
@@ -47,6 +100,72 @@ class TableArg(TableValuedFunctionArgument):
def orderBy(self, *cols: "ColumnOrName") -> "TableArg":
"""
Orders the data within each partition by the specified columns.
+
+ This method orders the data within partitions. It must be called after
+ `partitionBy()` or `withSinglePartition()` has been called.
+
+ Parameters
+ ----------
+ cols : str, :class:`Column`, or list
+ Column names or :class:`Column` objects to order by. Columns can be
+ ordered in ascending or descending order using :meth:`Column.asc` or
+ :meth:`Column.desc`.
+
+ Returns
+ -------
+ :class:`TableArg`
+ A new `TableArg` instance with ordering applied.
+
+ Examples
+ --------
+ >>> from pyspark.sql.functions import udtf
+ >>>
+ >>> @udtf(returnType="key: int, value: string")
+ ... class ProcessUDTF:
+ ... def eval(self, row):
+ ... yield row["key"], row["value"]
+ ...
+ >>> df = spark.createDataFrame(
+ ... [(1, "b"), (1, "a"), (2, "d"), (2, "c")], ["key", "value"]
+ ... )
+ >>>
+ >>> # Order by a single column within partitions
+ >>> result = ProcessUDTF(df.asTable().partitionBy("key").orderBy("value"))
+ >>> result.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 1| b|
+ | 2| c|
+ | 2| d|
+ +---+-----+
+ >>>
+ >>> # Order by multiple columns
+ >>> df2 = spark.createDataFrame(
+ ... [(1, "a", 2), (1, "a", 1), (1, "b", 3)], ["key", "value", "num"]
+ ... )
+ >>> result2 = ProcessUDTF(df2.asTable().partitionBy("key").orderBy("value", "num"))
+ >>> result2.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 1| a|
+ | 1| b|
+ +---+-----+
+ >>>
+ >>> # Order by descending order
+ >>> result3 = ProcessUDTF(df.asTable().partitionBy("key").orderBy(df.value.desc()))
+ >>> result3.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| b|
+ | 1| a|
+ | 2| d|
+ | 2| c|
+ +---+-----+
"""
...
@@ -54,5 +173,52 @@ class TableArg(TableValuedFunctionArgument):
def withSinglePartition(self) -> "TableArg":
"""
Forces the data to be processed in a single partition.
+
+ This method indicates that all data should be treated as a single partition.
+ It cannot be called after `partitionBy()` has been called. `orderBy()` can
+ be called after this method to order the data within the single partition.
+
+ Returns
+ -------
+ :class:`TableArg`
+ A new `TableArg` instance with single partition constraint applied.
+
+ Examples
+ --------
+ >>> from pyspark.sql.functions import udtf
+ >>>
+ >>> @udtf(returnType="key: int, value: string")
+ ... class ProcessUDTF:
+ ... def eval(self, row):
+ ... yield row["key"], row["value"]
+ ...
+ >>> df = spark.createDataFrame(
+ ... [(1, "a"), (2, "b"), (3, "c")], ["key", "value"]
+ ... )
+ >>>
+ >>> # Process all data in a single partition
+ >>> result = ProcessUDTF(df.asTable().withSinglePartition())
+ >>> result.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 2| b|
+ | 3| c|
+ +---+-----+
+ >>>
+ >>> # Use withSinglePartition and orderBy together
+ >>> df2 = spark.createDataFrame(
+ ... [(3, "c"), (1, "a"), (2, "b")], ["key", "value"]
+ ... )
+ >>> result2 = ProcessUDTF(df2.asTable().withSinglePartition().orderBy("key"))
+ >>> result2.show()
+ +---+-----+
+ |key|value|
+ +---+-----+
+ | 1| a|
+ | 2| b|
+ | 3| c|
+ +---+-----+
"""
...
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]