This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 536ac30d3ca [SPARK-44879][PYTHON][DOCS] Refine the docstring of spark.createDataFrame
536ac30d3ca is described below

commit 536ac30d3ca4bc81dca6a31d1211e61f25cbbc14
Author: allisonwang-db <allison.w...@databricks.com>
AuthorDate: Mon Aug 21 09:24:39 2023 +0900

    [SPARK-44879][PYTHON][DOCS] Refine the docstring of spark.createDataFrame

    ### What changes were proposed in this pull request?

    This PR refines the examples in the docstring of `spark.createDataFrame`. It also removes
    the examples that use RDDs, as RDDs are outdated and Spark Connect won't support them:

        pyspark.errors.exceptions.base.PySparkNotImplementedError: [NOT_IMPLEMENTED] sparkContext() is not implemented.

    ### Why are the changes needed?

    To improve the PySpark documentation.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    Doctest.

    Closes #42569 from allisonwang-db/spark-44879-refine-create-df.

    Authored-by: allisonwang-db <allison.w...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
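For context on the removal, the refined examples replace the parallelize-then-convert
pattern with direct local data, which works on both classic Spark and Spark Connect.
A minimal sketch, assuming a local SparkSession (the builder setup below is illustrative
and not part of the commit):

    from pyspark.sql import Row, SparkSession

    # Illustrative local session; any existing SparkSession behaves the same.
    spark = SparkSession.builder.master("local[1]").appName("demo").getOrCreate()

    # Instead of spark.sparkContext.parallelize([('Alice', 1)]) followed by
    # createDataFrame(rdd, ...), pass the data and a DDL schema string directly:
    spark.createDataFrame([("Alice", 1)], "name: string, age: int").show()
    # +-----+---+
    # | name|age|
    # +-----+---+
    # |Alice|  1|
    # +-----+---+

    # Row objects likewise need no intermediate RDD:
    Person = Row("name", "age")
    spark.createDataFrame([Person("Alice", 1)]).show()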
---
 python/pyspark/sql/session.py | 118 +++++++++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 47 deletions(-)

diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index ce197319977..7a492d634cf 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -1255,7 +1255,7 @@ class SparkSession(SparkConversionMixin):
             :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
         schema : :class:`pyspark.sql.types.DataType`, str or list, optional
             a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
-            column names, default is None.  The data type string format equals to
+            column names, default is None. The data type string format equals to
             :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
             omit the ``struct<>``.
 
@@ -1292,34 +1292,31 @@ class SparkSession(SparkConversionMixin):
         --------
         Create a DataFrame from a list of tuples.
 
-        >>> spark.createDataFrame([('Alice', 1)]).collect()
-        [Row(_1='Alice', _2=1)]
-        >>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).collect()
-        [Row(name='Alice', age=1)]
+        >>> spark.createDataFrame([('Alice', 1)]).show()
+        +-----+---+
+        |   _1| _2|
+        +-----+---+
+        |Alice|  1|
+        +-----+---+
 
-        Create a DataFrame from a list of dictionaries
+        Create a DataFrame from a list of dictionaries.
 
         >>> d = [{'name': 'Alice', 'age': 1}]
-        >>> spark.createDataFrame(d).collect()
-        [Row(age=1, name='Alice')]
-
-        Create a DataFrame from an RDD.
-
-        >>> rdd = spark.sparkContext.parallelize([('Alice', 1)])
-        >>> spark.createDataFrame(rdd).collect()
-        [Row(_1='Alice', _2=1)]
-        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
-        >>> df.collect()
-        [Row(name='Alice', age=1)]
-
-        Create a DataFrame from Row instances.
-
-        >>> from pyspark.sql import Row
-        >>> Person = Row('name', 'age')
-        >>> person = rdd.map(lambda r: Person(*r))
-        >>> df2 = spark.createDataFrame(person)
-        >>> df2.collect()
-        [Row(name='Alice', age=1)]
+        >>> spark.createDataFrame(d).show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  1|Alice|
+        +---+-----+
+
+        Create a DataFrame with column names specified.
+
+        >>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).show()
+        +-----+---+
+        | name|age|
+        +-----+---+
+        |Alice|  1|
+        +-----+---+
 
         Create a DataFrame with the explicit schema specified.
 
@@ -1327,31 +1324,58 @@ class SparkSession(SparkConversionMixin):
         >>> schema = StructType([
         ...     StructField("name", StringType(), True),
         ...     StructField("age", IntegerType(), True)])
-        >>> df3 = spark.createDataFrame(rdd, schema)
-        >>> df3.collect()
-        [Row(name='Alice', age=1)]
+        >>> spark.createDataFrame([('Alice', 1)], schema).show()
+        +-----+---+
+        | name|age|
+        +-----+---+
+        |Alice|  1|
+        +-----+---+
+
+        Create a DataFrame with the schema in DDL formatted string.
+
+        >>> spark.createDataFrame([('Alice', 1)], "name: string, age: int").show()
+        +-----+---+
+        | name|age|
+        +-----+---+
+        |Alice|  1|
+        +-----+---+
+
+        Create an empty DataFrame.
+        When initializing an empty DataFrame in PySpark, it's mandatory to specify its schema,
+        as the DataFrame lacks data from which the schema can be inferred.
+
+        >>> spark.createDataFrame([], "name: string, age: int").show()
+        +----+---+
+        |name|age|
+        +----+---+
+        +----+---+
+
+        Create a DataFrame from Row objects.
+
+        >>> from pyspark.sql import Row
+        >>> Person = Row('name', 'age')
+        >>> df = spark.createDataFrame([Person("Alice", 1)])
+        >>> df.show()
+        +-----+---+
+        | name|age|
+        +-----+---+
+        |Alice|  1|
+        +-----+---+
 
         Create a DataFrame from a pandas DataFrame.
 
-        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
-        [Row(name='Alice', age=1)]
+        >>> spark.createDataFrame(df.toPandas()).show()  # doctest: +SKIP
+        +-----+---+
+        | name|age|
+        +-----+---+
+        |Alice|  1|
+        +-----+---+
         >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
-        [Row(0=1, 1=2)]
-
-        Create a DataFrame from an RDD with the schema in DDL formatted string.
-
-        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
-        [Row(a='Alice', b=1)]
-        >>> rdd = rdd.map(lambda row: row[1])
-        >>> spark.createDataFrame(rdd, "int").collect()
-        [Row(value=1)]
-
-        When the type is unmatched, it throws an exception.
-
-        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
-        Traceback (most recent call last):
-        ...
-        Py4JJavaError: ...
+        +---+---+
+        |  0|  1|
+        +---+---+
+        |  1|  2|
+        +---+---+
         """
         SparkSession._activeSession = self
         assert self._jvm is not None

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org