This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 4027474cc744 [SPARK-45221][PYTHON][DOCS] Refine docstring of DataFrameReader.parquet
4027474cc744 is described below
commit 4027474cc74438b29b0eae38f07ab03aeab99f5a
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Oct 12 09:24:27 2023 +0900
[SPARK-45221][PYTHON][DOCS] Refine docstring of DataFrameReader.parquet
### What changes were proposed in this pull request?
This PR refines the docstring of DataFrameReader.parquet by adding more
examples.
### Why are the changes needed?
To improve PySpark documentation
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
doctest
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #43301 from allisonwang-db/spark-45221-refine-parquet.
Lead-authored-by: Hyukjin Kwon <[email protected]>
Co-authored-by: allisonwang-db <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/readwriter.py | 68 ++++++++++++++++++++++++++++++++++------
1 file changed, 58 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index cfac8fdbc68b..ea429a75e157 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -495,6 +495,7 @@ class DataFrameReader(OptionUtils):
Parameters
----------
paths : str
+ One or more file paths to read the Parquet files from.
Other Parameters
----------------
@@ -505,24 +506,71 @@ class DataFrameReader(OptionUtils):
.. # noqa
+ Returns
+ -------
+ :class:`DataFrame`
+ A DataFrame containing the data from the Parquet files.
+
Examples
--------
+ Create sample dataframes.
+
+ >>> df = spark.createDataFrame(
+ ... [(10, "Alice"), (15, "Bob"), (20, "Tom")], schema=["age", "name"])
+ >>> df2 = spark.createDataFrame([(70, "Alice"), (80, "Bob")], schema=["height", "name"])
+
Write a DataFrame into a Parquet file and read it back.
>>> import tempfile
>>> with tempfile.TemporaryDirectory() as d:
- ... # Write a DataFrame into a Parquet file
- ... spark.createDataFrame(
- ... [{"age": 100, "name": "Hyukjin Kwon"}]
- ... ).write.mode("overwrite").format("parquet").save(d)
+ ... # Write a DataFrame into a Parquet file.
+ ... df.write.mode("overwrite").format("parquet").save(d)
...
... # Read the Parquet file as a DataFrame.
- ... spark.read.parquet(d).show()
- +---+------------+
- |age| name|
- +---+------------+
- |100|Hyukjin Kwon|
- +---+------------+
+ ... spark.read.parquet(d).orderBy("name").show()
+ +---+-----+
+ |age| name|
+ +---+-----+
+ | 10|Alice|
+ | 15| Bob|
+ | 20| Tom|
+ +---+-----+
+
+ Read a Parquet file with a specific column.
+
+ >>> with tempfile.TemporaryDirectory() as d:
+ ... df.write.mode("overwrite").format("parquet").save(d)
+ ...
+ ... # Read the Parquet file with only the 'name' column.
+ ... spark.read.schema("name string").parquet(d).orderBy("name").show()
+ +-----+
+ | name|
+ +-----+
+ |Alice|
+ | Bob|
+ | Tom|
+ +-----+
+
+ Read multiple Parquet files and merge schema.
+
+ >>> with tempfile.TemporaryDirectory() as d1, tempfile.TemporaryDirectory() as d2:
+ ... df.write.mode("overwrite").format("parquet").save(d1)
+ ... df2.write.mode("overwrite").format("parquet").save(d2)
+ ...
+ ... spark.read.option(
+ ... "mergeSchema", "true"
+ ... ).parquet(d1, d2).select(
+ ... "name", "age", "height"
+ ... ).orderBy("name", "age").show()
+ +-----+----+------+
+ | name| age|height|
+ +-----+----+------+
+ |Alice|NULL| 70|
+ |Alice| 10| NULL|
+ | Bob|NULL| 80|
+ | Bob| 15| NULL|
+ | Tom| 20| NULL|
+ +-----+----+------+
"""
mergeSchema = options.get("mergeSchema", None)
pathGlobFilter = options.get("pathGlobFilter", None)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]