[
https://issues.apache.org/jira/browse/SPARK-56502?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18081433#comment-18081433
]
Anupam Yadav commented on SPARK-56502:
--------------------------------------
I would like to work on this. Will investigate and submit a PR.
> Fix integer overflow in DirectByteBufferOutputStream capacity check
> -------------------------------------------------------------------
>
> Key: SPARK-56502
> URL: https://issues.apache.org/jira/browse/SPARK-56502
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 4.1.0, 4.0.0, 4.0.1, 4.0.2, 4.2.0, 4.1.1
> Reporter: Tim Lee
> Priority: Major
> Labels: pull-request-available
>
> {{DirectByteBufferOutputStream.write(b, off, len)}} computes
> {{buffer.position() + len}} using {{Int}} arithmetic. When the sum exceeds
> {{Integer.MAX_VALUE}}, it wraps to negative, bypassing the
> {{ensureCapacity}} guard. The subsequent {{buffer.put()}} writes past the
> buffer, causing a SIGSEGV.
> Here's an example JVM crash stack trace:
>
> {code:java}
> J 17470 c2 org.apache.spark.util.DirectByteBufferOutputStream.write([BII)V
> (25 bytes) @ 0x00007f47dcaaa7d2 [0x00007f47dcaaa5c0+0x0000000000000212]
> J 30250 c2
> java.nio.channels.Channels$WritableByteChannelImpl.write(Ljava/nio/ByteBuffer;)I
> [email protected] (151 bytes) @ 0x00007f47dd81ffd4
> [0x00007f47dd81fd00+0x00000000000002d4]
> J 21322 c1
> org.apache.arrow.vector.ipc.WriteChannel.write(Ljava/nio/ByteBuffer;)J (64
> bytes) @ 0x00007f47cdb37184 [0x00007f47cdb36d20+0x0000000000000464]
> J 23536 c1 org.apache.arrow.vector.ipc.WriteChannel.writeIntLittleEndian(I)J
> (17 bytes) @ 0x00007f47cdd1fe8c [0x00007f47cdd1fce0+0x00000000000001ac]
> j
> org.apache.arrow.vector.ipc.message.MessageSerializer.serialize(Lorg/apache/arrow/vector/ipc/WriteChannel;Lorg/apache/arrow/vector/ipc/message/ArrowRecordBatch;Lorg/apache/arrow/vector/ipc/message/IpcOption;)Lorg/apache/arrow/vector/ipc/message/ArrowBlock;+58
> j
> org.apache.arrow.vector.ipc.ArrowWriter.writeRecordBatch(Lorg/apache/arrow/vector/ipc/message/ArrowRecordBatch;)Lorg/apache/arrow/vector/ipc/message/ArrowBlock;+9
> j org.apache.arrow.vector.ipc.ArrowWriter.writeBatch()V+28
> J 29845 c1
> org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeNextBatchToArrowStream(Lorg/apache/arrow/vector/VectorSchemaRoot;Lorg/apache/arrow/vector/ipc/ArrowStreamWriter;Ljava/io/DataOutputStream;Lscala/collection/Iterator;)Z
> (233 bytes) @ 0x00007f47cde9603c [0x00007f47cde949e0+0x000000000000165c]
> j
> org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeNextBatchToArrowStream$(Lorg/apache/spark/sql/execution/python/BasicPythonArrowInput;Lorg/apache/arrow/vector/VectorSchemaRoot;Lorg/apache/arrow/vector/ipc/ArrowStreamWriter;Ljava/io/DataOutputStream;Lscala/collection/Iterator;)Z+6
> j
> org.apache.spark.sql.execution.python.BaseArrowPythonRunner.writeNextBatchToArrowStream(Lorg/apache/arrow/vector/VectorSchemaRoot;Lorg/apache/arrow/vector/ipc/ArrowStreamWriter;Ljava/io/DataOutputStream;Lscala/collection/Iterator;)Z+6
> j
> org.apache.spark.sql.execution.python.PythonArrowInput$ArrowWriter.writeNextInputToStream(Ljava/io/DataOutputStream;)Z+128
> J 29886 c1
> org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker()V
> (377 bytes) @ 0x00007f47ce29f8c4 [0x00007f47ce29efa0+0x0000000000000924]
> j
> org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read([BII)I+204
> J 20370 c2 java.io.BufferedInputStream.fill()V [email protected] (203 bytes)
> @ 0x00007f47dd129790 [0x00007f47dd129720+0x0000000000000070]
> J 12173 c2 java.io.BufferedInputStream.read()I [email protected] (49 bytes) @
> 0x00007f47dc7cc48c [0x00007f47dc7cc3a0+0x00000000000000ec]
> J 8989 c1 java.io.DataInputStream.readInt()I [email protected] (68 bytes) @
> 0x00007f47cd16d40c [0x00007f47cd16d300+0x000000000000010c]
> j
> org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read()Ljava/lang/Object;+216
> j org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext()Z+16
> j org.apache.spark.InterruptibleIterator.hasNext()Z+11
> j scala.collection.Iterator$$anon$11.hasNext()Z+16
> J 20285 c2 scala.collection.Iterator$$anon$10.hasNext()Z (10 bytes) @
> 0x00007f47dd1230e0 [0x00007f47dd1230a0+0x0000000000000040]
> j
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage22.processNext()V+739
> j org.apache.spark.sql.execution.BufferedRowIterator.hasNext()Z+11
> j
> org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext()Z+4
> J 20285 c2 scala.collection.Iterator$$anon$10.hasNext()Z (10 bytes) @
> 0x00007f47dd1230e0 [0x00007f47dd1230a0+0x0000000000000040]
> J 20285 c2 scala.collection.Iterator$$anon$10.hasNext()Z (10 bytes) @
> 0x00007f47dd1230e0 [0x00007f47dd1230a0+0x0000000000000040]
> j org.apache.spark.sql.execution.python.BatchIterator.hasNext()Z+4
> j
> org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeNextBatchToArrowStream(Lorg/apache/arrow/vector/VectorSchemaRoot;Lorg/apache/arrow/vector/ipc/ArrowStreamWriter;Ljava/io/DataOutputStream;Lscala/collection/Iterator;)Z+16
> j
> org.apache.spark.sql.execution.python.BasicPythonArrowInput.writeNextBatchToArrowStream$(Lorg/apache/spark/sql/execution/python/BasicPythonArrowInput;Lorg/apache/arrow/vector/VectorSchemaRoot;Lorg/apache/arrow/vector/ipc/ArrowStreamWriter;Ljava/io/DataOutputStream;Lscala/collection/Iterator;)Z+6
> j
> org.apache.spark.sql.execution.python.BaseArrowPythonRunner.writeNextBatchToArrowStream(Lorg/apache/arrow/vector/VectorSchemaRoot;Lorg/apache/arrow/vector/ipc/ArrowStreamWriter;Ljava/io/DataOutputStream;Lscala/collection/Iterator;)Z+6
> j
> org.apache.spark.sql.execution.python.PythonArrowInput$ArrowWriter.writeNextInputToStream(Ljava/io/DataOutputStream;)Z+128
> j
> org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker()V+107
> j
> org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read([BII)I+204
> J 20370 c2 java.io.BufferedInputStream.fill()V [email protected] (203 bytes)
> @ 0x00007f47dd129790 [0x00007f47dd129720+0x0000000000000070]
> J 12173 c2 java.io.BufferedInputStream.read()I [email protected] (49 bytes) @
> 0x00007f47dc7cc48c [0x00007f47dc7cc3a0+0x00000000000000ec]
> J 8989 c1 java.io.DataInputStream.readInt()I [email protected] (68 bytes) @
> 0x00007f47cd16d40c [0x00007f47cd16d300+0x000000000000010c]
> j
> org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read()Ljava/lang/Object;+216
> j org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext()Z+16
> ...{code}
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org