IMarvinTPA opened a new issue, #55824:
URL: https://github.com/apache/spark/issues/55824
It could be something I'm doing wrong, but it is super easy to reproduce:
```
import pyspark.pandas as ppd
import pyspark.sql as ss

# Build (or reuse) a Hive-enabled session. Arrow-based pandas conversion is
# requested, ANSI mode is turned off, and both Python fault-handler options
# are switched on.
mySpark = ss.SparkSession.builder.appName('test') \
    .config("spark.sql.catalogImplementation", "hive") \
    .config("spark.sql.legacy.createHiveTableByDefault", "false")\
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .config("spark.sql.sources.default", "parquet") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "True") \
    .config("spark.sql.ansi.enabled", "false") \
    .config("spark.python.worker.faulthandler.enabled", "true") \
    .config("spark.sql.execution.pyspark.udf.faulthandler.enabled",
            "true") \
    .enableHiveSupport().getOrCreate()
# Settings that were disabled for this reproduction:
# .config("spark.executor.memory", "6g") \
# .config("spark.executor.memoryOverhead", "1g") \
# .config("spark.delta.catalog.update.enabled", "false") \

# Decimal values kept as strings, including a negative value and one missing
# entry (None).
DECIMAL_STR_LIST = ["1",
                    "2.2",
                    "3.3",
                    "44.4",
                    "555.55",
                    "654321.12",
                    "7.75",
                    "-800000.8",
                    None,
                    "1000"]


def noChange(val: str) -> str:
    """Return ``val`` passed through ``str()``.

    NOTE(review): the input list contains ``None`` even though the parameter
    is annotated ``str``; what value the function receives for that row
    depends on pandas-on-Spark and is not visible here.
    """
    return str(val)


cols = {"aDecimal82": DECIMAL_STR_LIST}
df = ppd.DataFrame(cols)
# Apply the Python function element-wise to the single column.
df["aDecimal82"] = df["aDecimal82"].apply(noChange)
# Printing the frame is the statement the reported traceback points at.
print(df)
```
This code results in this error message:
'''
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[21], [line 41](vscode-notebook-cell:?execution_count=21&line=41)
38 df = ppd.DataFrame(cols)
39 df["aDecimal82"] = df["aDecimal82"].apply(noChange)
---> [41](vscode-notebook-cell:?execution_count=21&line=41) print(df)
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13997,
in DataFrame.__repr__(self)
13993 max_display_count = get_option("display.max_rows")
13994 if max_display_count is None:
13995 return self._to_internal_pandas().to_string()
13996
> 13997 pdf = cast("DataFrame",
self._get_or_create_repr_pandas_cache(max_display_count))
13998 pdf_length = len(pdf)
13999 pdf = cast("DataFrame", pdf.iloc[:max_display_count])
14000 if pdf_length > max_display_count:
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13988,
in DataFrame._get_or_create_repr_pandas_cache(self, n)
13985 def _get_or_create_repr_pandas_cache(self, n: int) ->
Union[pd.DataFrame, pd.Series]:
13986 if not hasattr(self, "_repr_pandas_cache") or n not in
self._repr_pandas_cache:
13987 object.__setattr__(
> 13988 self, "_repr_pandas_cache", {n: self.head(n +
1)._to_internal_pandas()}
13989 )
13990 return self._repr_pandas_cache[n]
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13983,
in DataFrame._to_internal_pandas(self)
13979 Return a pandas DataFrame directly from _internal to avoid
overhead of copy.
13980
13981 This method is for internal use only.
13982 """
> 13983 return self._internal.to_pandas_frame
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\utils.py:[608](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/pandas/utils.py:608),
in lazy_property.<locals>.wrapped_lazy_property(self)
604 @property
605 @functools.wraps(fn)
606 def wrapped_lazy_property(self):
607 if not hasattr(self, attr_name):
--> 608 setattr(self, attr_name, fn(self))
609 return getattr(self, attr_name)
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\internal.py:[1080](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/pandas/internal.py:1080),
in InternalFrame.to_pandas_frame(self)
1078 """Return as pandas DataFrame."""
1079 sdf = self.to_internal_spark_frame
-> 1080 pdf = sdf._to_pandas(pandasStructHandlingMode="row")
1081 if len(pdf) == 0 and len(sdf.schema) > 0:
1082 pdf = pdf.astype(
1083 {field.name: spark_type_to_pandas_dtype(field.dataType) for
field in sdf.schema}
1084 )
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\classic\dataframe.py:[1959](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/sql/classic/dataframe.py:1959),
in DataFrame._to_pandas(self, **kwargs)
1958 def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike":
-> 1959 return PandasConversionMixin._to_pandas(self, **kwargs)
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\pandas\conversion.py:[408](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/sql/pandas/conversion.py:408),
in PandasConversionMixin._to_pandas(self, **kwargs)
405 raise
407 # Below is toPandas without Arrow optimization.
--> 408 rows = self.collect()
409 if len(rows) > 0:
410 pdf = pd.DataFrame.from_records(rows, index=range(len(rows)),
columns=self.columns)
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\classic\dataframe.py:[503](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/sql/classic/dataframe.py:503),
in DataFrame.collect(self)
501 def collect(self) -> List[Row]:
502 with SCCallSiteSync(self._sc):
--> 503 sock_info = self._jdf.collectToPython()
504 with _load_from_socket(sock_info,
BatchedSerializer(CPickleSerializer())) as stream:
505 return list(stream)
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\java_gateway.py:1362,
in JavaMember.__call__(self, *args)
1356 command = proto.CALL_COMMAND_NAME +\
1357 self.command_header +\
1358 args_command +\
1359 proto.END_COMMAND_PART
1361 answer = self.gateway_client.send_command(command)
->
[1362](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/py4j/java_gateway.py:1362)
return_value = get_return_value(
1363 answer, self.gateway_client, self.target_id, self.name)
1365 for temp_arg in temp_args:
1366 if hasattr(temp_arg, "_detach"):
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\errors\exceptions\captured.py:251,
in capture_sql_exception.<locals>.deco(*a, **kw)
248 from py4j.protocol import Py4JJavaError
250 try:
-->
[251](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/errors/exceptions/captured.py:251)
return f(*a, **kw)
252 except Py4JJavaError as e:
253 converted = convert_exception(e.java_exception)
File
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\protocol.py:327,
in get_return_value(answer, gateway_client, target_id, name)
325 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
326 if answer[1] == REFERENCE_TYPE:
-->
[327](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/py4j/protocol.py:327)
raise Py4JJavaError(
328 "An error occurred while calling {0}{1}{2}.\n".
329 format(target_id, ".", name), value)
330 else:
331 raise Py4JError(
332 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
333 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o4134.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8
in stage 240.0 failed 1 times, most recent failure: Lost task 8.0 in stage
240.0 (TID 806) (mycomputername executor driver):
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:711)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:686)
at
scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
at
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1068)
at
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1045)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream(PythonArrowInput.scala:190)
at
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream$(PythonArrowInput.scala:180)
at
org.apache.spark.sql.execution.python.ArrowEvalPythonEvaluatorFactory$$anon$2.writeNextBatchToArrowStream(ArrowEvalPythonExec.scala:198)
at
org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeNextInputToStream(PythonArrowInput.scala:122)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:964)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at
org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:121)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at
org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:910)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:910)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:57)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:206)
at org.apache.spark.scheduler.Task.run(Task.scala:147)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:894)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:897)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: An established connection was aborted by the
software in your host machine
at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
at
java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:975)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1053)
... 45 more
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:3304)
at scala.Option.getOrElse(Option.scala:201)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3304)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:3296)
at scala.collection.immutable.List.foreach(List.scala:323)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:3296)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1353)
at
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1353)
at scala.Option.foreach(Option.scala:437)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1353)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3575)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3503)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3492)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
at
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1053)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2496)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2517)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2536)
at
org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:544)
at
org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:497)
at
org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:58)
at
org.apache.spark.sql.classic.Dataset.$anonfun$collectToPython$1(Dataset.scala:2084)
at
org.apache.spark.sql.classic.Dataset.$anonfun$withAction$2(Dataset.scala:2264)
at
org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:743)
at
org.apache.spark.sql.classic.Dataset.$anonfun$withAction$1(Dataset.scala:2262)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$8(SQLExecution.scala:228)
at
org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:352)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$7(SQLExecution.scala:189)
at
org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
at
org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
at
org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:106)
at
org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:189)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:375)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:188)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:810)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:130)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:317)
at org.apache.spark.sql.classic.Dataset.withAction(Dataset.scala:2262)
at
org.apache.spark.sql.classic.Dataset.collectToPython(Dataset.scala:2080)
at jdk.internal.reflect.GeneratedMethodAccessor165.invoke(Unknown
Source)
at
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:568)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.SparkException: Python worker exited
unexpectedly (crashed)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:711)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:686)
at
scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
at
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1068)
at
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1045)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream(PythonArrowInput.scala:190)
at
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream$(PythonArrowInput.scala:180)
at
org.apache.spark.sql.execution.python.ArrowEvalPythonEvaluatorFactory$$anon$2.writeNextBatchToArrowStream(ArrowEvalPythonExec.scala:198)
at
org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeNextInputToStream(PythonArrowInput.scala:122)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:964)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at
org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:121)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown
Source)
at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at
org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:910)
at
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:910)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:57)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:206)
at org.apache.spark.scheduler.Task.run(Task.scala:147)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:894)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
at
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:897)
at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
... 1 more
Caused by: java.io.IOException: An established connection was aborted by the
software in your host machine
at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
at
java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:975)
at
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1053)
... 45 more
'''
Versions in the environment:
Python 3.12.0
Running Java 17.0.0.1
pandas Version: 2.2.3
spark Version: 4.2.0-preview5
numpy Version: 2.1.3
pyArrow Version: 19.0.1
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]