Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22807#discussion_r227784077

    --- Diff: python/pyspark/sql/tests.py ---
    @@ -4961,6 +4961,31 @@ def foofoo(x, y):
                     ).collect
                 )
     
    +    def test_pandas_udf_detect_unsafe_type_conversion(self):
    +        from distutils.version import LooseVersion
    +        from pyspark.sql.functions import pandas_udf
    +        import pandas as pd
    +        import numpy as np
    +        import pyarrow as pa
    +
    +        values = [1.0] * 3
    +        pdf = pd.DataFrame({'A': values})
    +        df = self.spark.createDataFrame(pdf).repartition(1)
    +
    +        @pandas_udf(returnType="int")
    +        def udf(column):
    +            return pd.Series(np.linspace(0, 1, 3))
    +
    +        udf_boolean = df.select(['A']).withColumn('udf', udf('A'))
    +
    +        # Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast.
    +        if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
    --- End diff --

    BTW, let's bump up the minimum required PyArrow and Pandas versions if possible in 3.0 :-)
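    For context, a minimal sketch (not part of the PR diff) of the PyArrow behavior the test relies on, assuming PyArrow >= 0.11.0 is installed; the sample values are made up for illustration:

        import pandas as pd
        import pyarrow as pa

        # Fractional floats cannot be converted to int32 without losing data.
        series = pd.Series([0.0, 0.5, 1.0])

        # With safe=True, PyArrow raises ArrowInvalid instead of silently
        # truncating the values during the cast.
        try:
            pa.Array.from_pandas(series, type=pa.int32(), safe=True)
        except pa.ArrowInvalid as e:
            print("unsafe cast detected:", e)

        # With safe=False, the values are truncated to [0, 0, 1].
        print(pa.Array.from_pandas(series, type=pa.int32(), safe=False))

    The test above triggers the same check indirectly: the pandas_udf returns floats from np.linspace while declaring returnType="int", so the Arrow conversion is unsafe and should raise on PyArrow >= 0.11.0.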