This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 760cd665f6cd [SPARK-46557][PYTHON][DOCS] Refine docstring for DataFrame.schema/explain/printSchema
760cd665f6cd is described below

commit 760cd665f6cdd41e352477b1e2d235dd3e8de335
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Tue Jan 2 16:51:12 2024 +0900

    [SPARK-46557][PYTHON][DOCS] Refine docstring for DataFrame.schema/explain/printSchema

    ### What changes were proposed in this pull request?

    This PR proposes to improve the docstring of `DataFrame.schema`, `DataFrame.explain` and `DataFrame.printSchema`.

    ### Why are the changes needed?

    For better usability.

    ### Does this PR introduce _any_ user-facing change?

    Yes, it improves user-facing documentation.

    ### How was this patch tested?

    Manually ran the tests via:

    ```bash
    python/run-tests --python-executable=python3 --testnames 'pyspark.sql.dataframe'
    ```

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #44553 from HyukjinKwon/SPARK-46557.

    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/dataframe.py | 53 ++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f138b817bd22..7933c62a3503 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -609,14 +609,32 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):

         Examples
         --------
+        Example 1: Retrieve the inferred schema of the current DataFrame.
+
         >>> df = spark.createDataFrame(
         ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df.schema
+        StructType([StructField('age', LongType(), True),
+                    StructField('name', StringType(), True)])

-        Retrieve the schema of the current DataFrame.
+        Example 2: Retrieve the schema of the current DataFrame (DDL-formatted schema).

+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")],
+        ...     "age INT, name STRING")
         >>> df.schema
-        StructType([StructField('age', LongType(), True),
+        StructType([StructField('age', IntegerType(), True),
                     StructField('name', StringType(), True)])
+
+        Example 3: Retrieve the specified schema of the current DataFrame.
+
+        >>> from pyspark.sql.types import StructType, StructField, StringType
+        >>> df = spark.createDataFrame(
+        ...     [("a",), ("b",), ("c",)],
+        ...     StructType([StructField("value", StringType(), False)]))
+        >>> df.schema
+        StructType([StructField('value', StringType(), False)])
+
         """
         if self._schema is None:
             try:
@@ -648,6 +666,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):

         Examples
         --------
+        Example 1: Printing the schema of a DataFrame with basic columns
+
         >>> df = spark.createDataFrame(
         ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
         >>> df.printSchema()
@@ -655,18 +675,30 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
          |-- age: long (nullable = true)
          |-- name: string (nullable = true)

-        >>> df = spark.createDataFrame([(1, (2,2))], ["a", "b"])
+        Example 2: Printing the schema with a specified level for nested columns
+
+        >>> df = spark.createDataFrame([(1, (2, 2))], ["a", "b"])
         >>> df.printSchema(1)
         root
          |-- a: long (nullable = true)
          |-- b: struct (nullable = true)

+        Example 3: Printing the schema with deeper nesting level
+
         >>> df.printSchema(2)
         root
          |-- a: long (nullable = true)
          |-- b: struct (nullable = true)
          |    |-- _1: long (nullable = true)
          |    |-- _2: long (nullable = true)
+
+        Example 4: Printing the schema of a DataFrame with nullable and non-nullable columns
+
+        >>> df = spark.range(1).selectExpr("id AS nonnullable", "NULL AS nullable")
+        >>> df.printSchema()
+        root
+         |-- nonnullable: long (nullable = false)
+         |-- nullable: void (nullable = true)
         """
         if level:
             print(self._jdf.schema().treeString(level))
@@ -704,18 +736,17 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):

         Examples
         --------
+        Example 1: Print out the physical plan only (default).
+
         >>> df = spark.createDataFrame(
         ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
-
-        Print out the physical plan only (default).
-
         >>> df.explain()  # doctest: +SKIP
         == Physical Plan ==
         *(1) Scan ExistingRDD[age...,name...]

-        Print out all of the parsed, analyzed, optimized and physical plans.
+        Example 2: Print out all parsed, analyzed, optimized, and physical plans.

-        >>> df.explain(True)
+        >>> df.explain(extended=True)
         == Parsed Logical Plan ==
         ...
         == Analyzed Logical Plan ==
@@ -725,7 +756,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         == Physical Plan ==
         ...

-        Print out the plans with two sections: a physical plan outline and node details
+        Example 3: Print out the plans with two sections: a physical plan outline and node details.

         >>> df.explain(mode="formatted")  # doctest: +SKIP
         == Physical Plan ==
@@ -734,9 +765,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         Output [2]: [age..., name...]
         ...

-        Print a logical plan and statistics if they are available.
+        Example 4: Print a logical plan and statistics if they are available.

-        >>> df.explain("cost")
+        >>> df.explain(mode="cost")
         == Optimized Logical Plan ==
         ...Statistics...
         ...


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org