[
https://issues.apache.org/jira/browse/SPARK-54731?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18056659#comment-18056659
]
André Souprayane commented on SPARK-54731:
------------------------------------------
Hello Christoph,
I am not able to reproduce this error.
{code:java}
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version 4.0.2-SNAPSHOT
/_/
Using Python version 3.12.1 (main, Jul 10 2025 11:57:50)
Spark context Web UI available at
http://dac5a452-cfe3-4711-b8a4-1fd2294f74fd.internal.cloudapp.net:4040
Spark context available as 'sc' (master = local[*], app id =
local-1770298301920).
SparkSession available as 'spark'.
>>> import pyspark.sql
>>> import pyspark.sql.functions as F
>>> import pyspark.sql.types as T
>>>
>>> spark =
>>> pyspark.sql.SparkSession.builder.appName("tracelyzer-test").getOrCreate()
26/02/05 13:32:22 WARN SparkSession: Using an existing Spark session; only
runtime SQL configurations will take effect.
>>> rows = [(bytes([0xe6, 0x87]),)]
>>> df = spark.createDataFrame(rows, ["binary"])
>>> df = df.withColumn("reverse", F.reverse(df.binary))
>>> df.printSchema()
root
|-- binary: binary (nullable = true)
|-- reverse: string (nullable = true)
>>> df.show()
+-------+-------+
| binary|reverse|
+-------+-------+
|[E6 87]| �|
+-------+-------+ {code}
Note that {{reverse}} is documented to work only with string and array types, not binary:
[https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.reverse.html]
The relevant code is below — the {{doGenCode}} match in this file only handles {{StringType}} and {{ArrayType}}:
/workspaces/spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
nullSafeCodeGen(ctx, ev, c => dataType match {
case _: StringType => stringCodeGen(ev, c)
case _: ArrayType => arrayCodeGen(ctx, ev, c)
})
}
> sql.functions.reverse results in SparkIllegalArgumentException
> --------------------------------------------------------------
>
> Key: SPARK-54731
> URL: https://issues.apache.org/jira/browse/SPARK-54731
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 4.0.1
> Reporter: Christoph Rüthing
> Priority: Major
>
> Running the following example results in a SparkIllegalArgumentException.
> When I change the data to e.g. {{[0x00, 0x87]}}, it works without issue. From
> my understanding, {{reverse}} should simply reverse the arguments and should
> be independent of the data itself? Or do I misinterpret the function?
> {code:java}
> import pyspark.sql
> import pyspark.sql.functions as F
> import pyspark.sql.types as T
> spark =
> pyspark.sql.SparkSession.builder.appName("tracelyzer-test").getOrCreate()
> rows = [(bytes([0xe6, 0x87]),)]
> df = spark.createDataFrame(rows, ["binary"])
> df = df.withColumn("reverse", F.reverse(df.binary))
> df.printSchema()
> df.show(){code}
> {noformat}
> 25/12/17 14:25:21 ERROR Executor: Exception in task 2.0 in stage 3.0 (TID 27)
> org.apache.spark.SparkIllegalArgumentException: Cannot grow BufferHolder by
> size -436207608 because the size is negative
> at
> org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder.grow(BufferHolder.java:73)
> at
> org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.grow(UnsafeWriter.java:64)
> at
> org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.writeUnalignedBytes(UnsafeWriter.java:128)
> at
> org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:111)
> at
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown
> Source)
> at
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
> Source)
> at
> org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at
> org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
> at
> org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:901)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:901)
> at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
> at
> org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
> at org.apache.spark.scheduler.Task.run(Task.scala:147)
> at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
> at
> org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
> at
> org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
> at java.base/java.lang.Thread.run(Thread.java:1583)
> 25/12/17 14:25:21 WARN TaskSetManager: Lost task 2.0 in stage 3.0 (TID 27)
> (localhost executor driver): org.apache.spark.SparkIllegalArgumentException:
> Cannot grow BufferHolder by size -436207608 because the size is negative
> at
> org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder.grow(BufferHolder.java:73)
> at
> org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.grow(UnsafeWriter.java:64)
> at
> org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.writeUnalignedBytes(UnsafeWriter.java:128)
> at
> org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:111)
> at
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown
> Source)
> at
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
> Source)
> at
> org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> at
> org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
> at
> org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:901)
> at
> org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:901)
> at
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
> at
> org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
> at org.apache.spark.scheduler.Task.run(Task.scala:147)
> at
> org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
> at
> org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
> at
> org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
> at
> java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
> at java.base/java.lang.Thread.run(Thread.java:1583)
> {noformat}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]