Hyukjin Kwon created SPARK-48086:
------------------------------------
Summary: Different Arrow versions in client and server
Key: SPARK-48086
URL: https://issues.apache.org/jira/browse/SPARK-48086
Project: Spark
Issue Type: Sub-task
Components: Connect, PySpark, SQL
Affects Versions: 4.0.0
Reporter: Hyukjin Kwon
{code}
======================================================================
FAIL [1.071s]: test_pandas_udf_arrow_overflow
(pyspark.sql.tests.connect.test_parity_pandas_udf.PandasUDFParityTests.test_pandas_udf_arrow_overflow)
----------------------------------------------------------------------
pyspark.errors.exceptions.connect.PythonException:
An exception was thrown from the Python worker. Please see the stack trace
below.
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 302, in _create_array
return pa.Array.from_pandas(
^^^^^^^^^^^^^^^^^^^^^
File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Integer value 128 not in range: -128 to 127
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1834, in main
process()
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1826, in process
serializer.dump_stream(out_iter, outfile)
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 531, in dump_stream
return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(),
stream)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 104, in dump_stream
for batch in iterator:
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 525, in init_stream_yield_batches
batch = self._create_batch(series)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 511, in _create_batch
arrs.append(self._create_array(s, t, arrow_cast=self._arrow_cast))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 330, in _create_array
raise PySparkValueError(error_msg % (series.dtype, series.na...
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File
"/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf.py",
line 299, in test_pandas_udf_arrow_overflow
with self.assertRaisesRegex(
AssertionError: "Exception thrown when converting pandas.Series" does not match
"
An exception was thrown from the Python worker. Please see the stack trace
below.
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 302, in _create_array
return pa.Array.from_pandas(
^^^^^^^^^^^^^^^^^^^^^
File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Integer value 128 not in range: -128 to 127
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1834, in main
process()
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1826, in process
serializer.dump_stream(out_iter, outfile)
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 531, in dump_stream
Traceback (most recent call last):
File
"/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf.py",
line 279, in test_pandas_udf_detect_unsafe_type_conversion
with self.assertRaisesRegex(
AssertionError: "Exception thrown when converting pandas.Series" does not match
"
An exception was thrown from the Python worker. Please see the stack trace
below.
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 302, in _create_array
return pa.Array.from_pandas(
^^^^^^^^^^^^^^^^^^^^^
File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Float value 0.5 was truncated converting to int32
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1834, in main
process()
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1826, in process
serializer.dump_stream(out_iter, outfile)
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 531, in dump_stream
return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(),
stream)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 104, in dump_stream
for batch in iterator:
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 525, in init_stream_yield_batches
batch = self._create_batch(series)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 511, in _create_batch
arrs.append(self._create_array(s, t, arrow_cast=self._arrow_cast))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 330, in _create_array
raise PySparkValueError(error_msg % (series.dtype, ser..."
----------------------------------------------------------------------
{code}
{code}
======================================================================
FAIL [0.162s]: test_vectorized_udf_exception
(pyspark.sql.tests.connect.test_parity_pandas_udf_scalar.PandasUDFScalarParityTests.test_vectorized_udf_exception)
----------------------------------------------------------------------
pyspark.errors.exceptions.connect.PythonException:
An exception was thrown from the Python worker. Please see the stack trace
below.
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1834, in main
process()
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1826, in process
serializer.dump_stream(out_iter, outfile)
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 531, in dump_stream
return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(),
stream)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 104, in dump_stream
for batch in iterator:
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 524, in init_stream_yield_batches
for series in iterator:
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1734, in mapper
result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1734, in <genexpr>
result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
146, in <lambda>
verify_result_length(verify_result_type(func(*a)), len(a[0])),
^^^^^^^^
File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/util.py",
line 118, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py",
line 650, in <lambda>
scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
~~^~...
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File
"/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py",
line 35, in test_vectorized_udf_exception
self.check_vectorized_udf_exception()
File
"/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py",
line 658, in check_vectorized_udf_exception
with self.assertRaisesRegex(Exception, "division( or modulo)? by zero"):
AssertionError: "division( or modulo)? by zero" does not match "
An exception was thrown from the Python worker. Please see the stack trace
below.
Traceback (most recent call last):
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1834, in main
process()
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1826, in process
serializer.dump_stream(out_iter, outfile)
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 531, in dump_stream
return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(),
stream)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 104, in dump_stream
for batch in iterator:
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 524, in init_stream_yield_batches
for series in iterator:
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1734, in mapper
result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
1734, in <genexpr>
result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line
146, in <lambda>
verify_result_length(verify_result_type(func(*a)), len(a[0])),
^^^^^^^^
File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/util.py",
line 118, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File
"/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py",
line 650, in <lambda>
scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
~~^~..."
----------------------------------------------------------------------
{code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]