IMarvinTPA opened a new issue, #55824:
URL: https://github.com/apache/spark/issues/55824

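   I may be doing something wrong, but this is very easy to reproduce: applying a
   trivial function to a pandas-on-Spark column with `apply` and then printing
   the DataFrame crashes the Python worker. Minimal reproduction: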
   ```
   import pyspark.pandas as ppd
   import pyspark.sql as ss
   
   
   mySpark = ss.SparkSession.builder.appName('test') \
               .config("spark.sql.catalogImplementation", "hive") \
               .config("spark.sql.legacy.createHiveTableByDefault", "false")\
               .config("spark.sql.repl.eagerEval.enabled", True) \
               .config("spark.sql.sources.default", "parquet") \
               .config("spark.sql.execution.arrow.pyspark.enabled", "True") \
               .config("spark.sql.ansi.enabled", "false") \
               .config("spark.python.worker.faulthandler.enabled", "true") \
               .config("spark.sql.execution.pyspark.udf.faulthandler.enabled", 
"true") \
               .enableHiveSupport().getOrCreate()
   
               # .config("spark.executor.memory", "6g") \
               # .config("spark.executor.memoryOverhead", "1g") \
               # .config("spark.delta.catalog.update.enabled", "false") \
   
   
   DECIMAL_STR_LIST = ["1",
                       "2.2",
                       "3.3",
                       "44.4",
                       "555.55",
                       "654321.12",
                       "7.75",
                       "-800000.8",
                       None,
                       "1000"]
   
   
   def noChange(val: str) -> str:
       return str(val)
   
   cols = {"aDecimal82": DECIMAL_STR_LIST}
   df = ppd.DataFrame(cols)
   df["aDecimal82"] = df["aDecimal82"].apply(noChange)
   
   print(df)
   ```
   
   Running this code fails at `print(df)` with the following error:
   '''
   ---------------------------------------------------------------------------
   Py4JJavaError                             Traceback (most recent call last)
   Cell In[21], [line 41](vscode-notebook-cell:?execution_count=21&line=41)
        38 df = ppd.DataFrame(cols)
        39 df["aDecimal82"] = df["aDecimal82"].apply(noChange)
   ---> [41](vscode-notebook-cell:?execution_count=21&line=41) print(df)
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13997,
 in DataFrame.__repr__(self)
     13993         max_display_count = get_option("display.max_rows")
     13994         if max_display_count is None:
     13995             return self._to_internal_pandas().to_string()
     13996 
   > 13997         pdf = cast("DataFrame", 
self._get_or_create_repr_pandas_cache(max_display_count))
     13998         pdf_length = len(pdf)
     13999         pdf = cast("DataFrame", pdf.iloc[:max_display_count])
     14000         if pdf_length > max_display_count:
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13988,
 in DataFrame._get_or_create_repr_pandas_cache(self, n)
     13985     def _get_or_create_repr_pandas_cache(self, n: int) -> 
Union[pd.DataFrame, pd.Series]:
     13986         if not hasattr(self, "_repr_pandas_cache") or n not in 
self._repr_pandas_cache:
     13987             object.__setattr__(
   > 13988                 self, "_repr_pandas_cache", {n: self.head(n + 
1)._to_internal_pandas()}
     13989             )
     13990         return self._repr_pandas_cache[n]
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13983,
 in DataFrame._to_internal_pandas(self)
     13979         Return a pandas DataFrame directly from _internal to avoid 
overhead of copy.
     13980 
     13981         This method is for internal use only.
     13982         """
   > 13983         return self._internal.to_pandas_frame
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\utils.py:[608](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/pandas/utils.py:608),
 in lazy_property.<locals>.wrapped_lazy_property(self)
       604 @property
       605 @functools.wraps(fn)
       606 def wrapped_lazy_property(self):
       607     if not hasattr(self, attr_name):
   --> 608         setattr(self, attr_name, fn(self))
       609     return getattr(self, attr_name)
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\internal.py:[1080](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/pandas/internal.py:1080),
 in InternalFrame.to_pandas_frame(self)
      1078 """Return as pandas DataFrame."""
      1079 sdf = self.to_internal_spark_frame
   -> 1080 pdf = sdf._to_pandas(pandasStructHandlingMode="row")
      1081 if len(pdf) == 0 and len(sdf.schema) > 0:
      1082     pdf = pdf.astype(
      1083         {field.name: spark_type_to_pandas_dtype(field.dataType) for 
field in sdf.schema}
      1084     )
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\classic\dataframe.py:[1959](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/sql/classic/dataframe.py:1959),
 in DataFrame._to_pandas(self, **kwargs)
      1958 def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike":
   -> 1959     return PandasConversionMixin._to_pandas(self, **kwargs)
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\pandas\conversion.py:[408](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/sql/pandas/conversion.py:408),
 in PandasConversionMixin._to_pandas(self, **kwargs)
       405             raise
       407 # Below is toPandas without Arrow optimization.
   --> 408 rows = self.collect()
       409 if len(rows) > 0:
       410     pdf = pd.DataFrame.from_records(rows, index=range(len(rows)), 
columns=self.columns)
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\classic\dataframe.py:[503](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/sql/classic/dataframe.py:503),
 in DataFrame.collect(self)
       501 def collect(self) -> List[Row]:
       502     with SCCallSiteSync(self._sc):
   --> 503         sock_info = self._jdf.collectToPython()
       504     with _load_from_socket(sock_info, 
BatchedSerializer(CPickleSerializer())) as stream:
       505         return list(stream)
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\java_gateway.py:1362,
 in JavaMember.__call__(self, *args)
      1356 command = proto.CALL_COMMAND_NAME +\
      1357     self.command_header +\
      1358     args_command +\
      1359     proto.END_COMMAND_PART
      1361 answer = self.gateway_client.send_command(command)
   -> 
[1362](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/py4j/java_gateway.py:1362)
 return_value = get_return_value(
      1363     answer, self.gateway_client, self.target_id, self.name)
      1365 for temp_arg in temp_args:
      1366     if hasattr(temp_arg, "_detach"):
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\errors\exceptions\captured.py:251,
 in capture_sql_exception.<locals>.deco(*a, **kw)
       248 from py4j.protocol import Py4JJavaError
       250 try:
   --> 
[251](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark/errors/exceptions/captured.py:251)
     return f(*a, **kw)
       252 except Py4JJavaError as e:
       253     converted = convert_exception(e.java_exception)
   
   File 
c:\Users\599281\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\protocol.py:327,
 in get_return_value(answer, gateway_client, target_id, name)
       325 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
       326 if answer[1] == REFERENCE_TYPE:
   --> 
[327](file:///C:/Users/599281/AppData/Local/Programs/Python/Python312/Lib/site-packages/py4j/protocol.py:327)
     raise Py4JJavaError(
       328         "An error occurred while calling {0}{1}{2}.\n".
       329         format(target_id, ".", name), value)
       330 else:
       331     raise Py4JError(
       332         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
       333         format(target_id, ".", name, value))
   
   Py4JJavaError: An error occurred while calling o4134.collectToPython.
   : org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 
in stage 240.0 failed 1 times, most recent failure: Lost task 8.0 in stage 
240.0 (TID 806) (mycomputername executor driver): 
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:711)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:686)
        at 
scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
        at 
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1068)
        at 
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1045)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
        at 
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
 Source)
        at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at 
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream(PythonArrowInput.scala:190)
        at 
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream$(PythonArrowInput.scala:180)
        at 
org.apache.spark.sql.execution.python.ArrowEvalPythonEvaluatorFactory$$anon$2.writeNextBatchToArrowStream(ArrowEvalPythonExec.scala:198)
        at 
org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeNextInputToStream(PythonArrowInput.scala:122)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:964)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
        at 
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
        at 
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
        at 
org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:121)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
        at 
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown
 Source)
        at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
        at 
org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
        at 
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:910)
        at 
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:910)
        at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:57)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at 
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:206)
        at org.apache.spark.scheduler.Task.run(Task.scala:147)
        at 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:894)
        at 
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
        at 
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:897)
        at 
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at 
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:833)
   Caused by: java.io.IOException: An established connection was aborted by the 
software in your host machine
        at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
        at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
        at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
        at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)
        at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
        at 
java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:975)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
        at 
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
        at 
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
        at 
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1053)
        ... 45 more
   
   Driver stacktrace:
        at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:3304)
        at scala.Option.getOrElse(Option.scala:201)
        at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3304)
        at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:3296)
        at scala.collection.immutable.List.foreach(List.scala:323)
        at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:3296)
        at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1353)
        at 
org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1353)
        at scala.Option.foreach(Option.scala:437)
        at 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1353)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3575)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3503)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3492)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
        at 
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1053)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2496)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2517)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2536)
        at 
org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:544)
        at 
org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:497)
        at 
org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:58)
        at 
org.apache.spark.sql.classic.Dataset.$anonfun$collectToPython$1(Dataset.scala:2084)
        at 
org.apache.spark.sql.classic.Dataset.$anonfun$withAction$2(Dataset.scala:2264)
        at 
org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:743)
        at 
org.apache.spark.sql.classic.Dataset.$anonfun$withAction$1(Dataset.scala:2262)
        at 
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$8(SQLExecution.scala:228)
        at 
org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:352)
        at 
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$7(SQLExecution.scala:189)
        at 
org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
        at 
org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
        at 
org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:106)
        at 
org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
        at 
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:189)
        at 
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:375)
        at 
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:188)
        at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:810)
        at 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:130)
        at 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:317)
        at org.apache.spark.sql.classic.Dataset.withAction(Dataset.scala:2262)
        at 
org.apache.spark.sql.classic.Dataset.collectToPython(Dataset.scala:2080)
        at jdk.internal.reflect.GeneratedMethodAccessor165.invoke(Unknown 
Source)
        at 
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.base/java.lang.reflect.Method.invoke(Method.java:568)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
        at py4j.Gateway.invoke(Gateway.java:282)
        at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
        at py4j.commands.CallCommand.execute(CallCommand.java:79)
        at 
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
        at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
        at java.base/java.lang.Thread.run(Thread.java:833)
   Caused by: org.apache.spark.SparkException: Python worker exited 
unexpectedly (crashed)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:711)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:686)
        at 
scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
        at 
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1068)
        at 
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1045)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
        at 
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
 Source)
        at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at 
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream(PythonArrowInput.scala:190)
        at 
org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream$(PythonArrowInput.scala:180)
        at 
org.apache.spark.sql.execution.python.ArrowEvalPythonEvaluatorFactory$$anon$2.writeNextBatchToArrowStream(ArrowEvalPythonExec.scala:198)
        at 
org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeNextInputToStream(PythonArrowInput.scala:122)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:964)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
        at 
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
        at 
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
        at 
org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:121)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
        at 
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
        at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown
 Source)
        at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at 
org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
        at 
org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
        at 
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:910)
        at 
org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:910)
        at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:57)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at 
org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:206)
        at org.apache.spark.scheduler.Task.run(Task.scala:147)
        at 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:894)
        at 
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
        at 
org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:897)
        at 
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at 
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        ... 1 more
   Caused by: java.io.IOException: An established connection was aborted by the 
software in your host machine
        at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
        at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
        at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
        at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)
        at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
        at 
java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:975)
        at 
org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
        at 
java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
        at 
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
        at 
org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1053)
        ... 45 more
   '''
   Versions in the environment:
   Python 3.12.0
   Running Java 17.0.0.1
   pandas Version: 2.2.3
   spark Version: 4.2.0-preview5
   numpy Version: 2.1.3
   pyArrow Version: 19.0.1


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to