This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 6a44b627f40f [SPARK-45811][PYTHON][DOCS] Refine docstring of `from_xml`
6a44b627f40f is described below

commit 6a44b627f40f501a171794416b6a6a9cae8893b5
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Tue Nov 7 10:01:50 2023 -0800

    [SPARK-45811][PYTHON][DOCS] Refine docstring of `from_xml`

    ### What changes were proposed in this pull request?

    This PR proposes to improve the docstring of `from_xml`.

    ### Why are the changes needed?

    For end users, and better usability of PySpark.

    ### Does this PR introduce _any_ user-facing change?

    Yes, it fixes the user facing documentation.

    ### How was this patch tested?

    Manually tested.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #43680 from HyukjinKwon/SPARK-45186.

    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/functions.py | 51 ++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index dd6be89ab853..ef5c0ea073ab 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -13635,6 +13635,8 @@ def json_object_keys(col: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("json_object_keys", col)
 
 
+# TODO: Fix and add an example for StructType with Spark Connect
+# e.g., StructType([StructField("a", IntegerType())])
 @_try_remote_functions
 def from_xml(
     col: "ColumnOrName",
@@ -13668,40 +13670,51 @@ def from_xml(
 
     Examples
     --------
-    >>> from pyspark.sql.types import *
-    >>> from pyspark.sql.functions import from_xml, schema_of_xml, lit
-
-    StructType input with simple IntegerType.
+    Example 1: Parsing XML with a :class:`StructType` schema
 
+    >>> import pyspark.sql.functions as sf
+    >>> from pyspark.sql.types import StructType, StructField, LongType
+    ...
# Sample data with an XML column
     >>> data = [(1, '''<p><a>1</a></p>''')]
     >>> df = spark.createDataFrame(data, ("key", "value"))
+    ... # Define the schema using a StructType
+    >>> schema = StructType([StructField("a", LongType())])
+    ... # Parse the XML column using the specified schema
+    >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
+    [Row(xml=Row(a=1))]
 
-    TODO: Fix StructType for spark connect
-    schema = StructType([StructField("a", IntegerType())])
+    Example 2: Parsing XML with a DDL-formatted string schema
 
+    >>> import pyspark.sql.functions as sf
+    >>> data = [(1, '''<p><a>1</a></p>''')]
+    >>> df = spark.createDataFrame(data, ("key", "value"))
+    ... # Define the schema using a DDL-formatted string
     >>> schema = "STRUCT<a: BIGINT>"
-    >>> df.select(from_xml(df.value, schema).alias("xml")).collect()
+    ... # Parse the XML column using the DDL-formatted schema
+    >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
     [Row(xml=Row(a=1))]
 
-    String input.
-
-    >>> df.select(from_xml(df.value, "a INT").alias("xml")).collect()
-    [Row(xml=Row(a=1))]
+    Example 3: Parsing XML with :class:`ArrayType` in schema
 
+    >>> import pyspark.sql.functions as sf
     >>> data = [(1, '<p><a>1</a><a>2</a></p>')]
     >>> df = spark.createDataFrame(data, ("key", "value"))
-
-    TODO: Fix StructType for spark connect
-    schema = StructType([StructField("a", ArrayType(IntegerType()))])
-
+    ... # Define the schema with an Array type
     >>> schema = "STRUCT<a: ARRAY<BIGINT>>"
-    >>> df.select(from_xml(df.value, schema).alias("xml")).collect()
+    ... # Parse the XML column using the schema with an Array
+    >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
     [Row(xml=Row(a=[1, 2]))]
 
-    Column input generated by schema_of_xml.
+    Example 4: Parsing XML using :meth:`pyspark.sql.functions.schema_of_xml`
 
-    >>> schema = schema_of_xml(lit(data[0][1]))
-    >>> df.select(from_xml(df.value, schema).alias("xml")).collect()
+    >>> import pyspark.sql.functions as sf
+    >>> # Sample data with an XML column
+    ... data = [(1, '<p><a>1</a><a>2</a></p>')]
+    >>> df = spark.createDataFrame(data, ("key", "value"))
+    ... # Generate the schema from an example XML value
+    >>> schema = sf.schema_of_xml(sf.lit(data[0][1]))
+    ... # Parse the XML column using the generated schema
+    >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
     [Row(xml=Row(a=[1, 2]))]
     """


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org