Github user BryanCutler commented on a diff in the pull request:
https://github.com/apache/spark/pull/21427#discussion_r191502180
--- Diff: python/pyspark/sql/tests.py ---
@@ -4931,6 +4931,63 @@ def foo3(key, pdf):
expected4 = udf3.func((), pdf)
self.assertPandasEqual(expected4, result4)
+ def test_column_order(self):
+ import pandas as pd
+ from pyspark.sql.functions import pandas_udf, PandasUDFType
+ df = self.data
+
+ # Function returns a pdf with required column names, but order could be arbitrary using dict
+ def change_col_order(pdf):
+ # Constructing a DataFrame from a dict should result in the same order,
+ # but use from_items to ensure the pdf column order is different than schema
+ return pd.DataFrame.from_items([
+ ('id', pdf.id),
+ ('u', pdf.v * 2),
+ ('v', pdf.v)])
+
+ ordered_udf = pandas_udf(
+ change_col_order,
+ 'id long, v int, u int',
+ PandasUDFType.GROUPED_MAP
+ )
+
+ def positional_col_order(pdf):
--- End diff --
Yeah, I'll add a test for an integer index. I don't think we need to
explicitly support only string or int indexes. Position will be used only
if the index is not string-based.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]