[GitHub] [spark] mridulm commented on a diff in pull request #38064: [SPARK-40622][SQL][CORE]Result of a single task in collect() must fit in 2GB

GitBox Tue, 11 Oct 2022 22:55:31 -0700


mridulm commented on code in PR #38064:
URL: https://github.com/apache/spark/pull/38064#discussion_r993007045



##########
core/src/main/scala/org/apache/spark/executor/Executor.scala:
##########
@@ -659,9 +659,9 @@ private[spark] class Executor(
         val accumUpdates = task.collectAccumulatorUpdates()
         val metricPeaks = metricsPoller.getTaskMetricPeaks(taskId)
         // TODO: do not serialize value twice
-        val directResult = new DirectTaskResult(valueBytes, accumUpdates, 
metricPeaks)
-        val serializedDirectResult = ser.serialize(directResult)
-        val resultSize = serializedDirectResult.limit()
+        val directResult = new DirectTaskResult(valueByteBuffer, accumUpdates, 
metricPeaks)
+        val serializedDirectResult = 
SerializerHelper.serializeToChunkedBuffer(ser, directResult)

Review Comment:
   Unlike the earlier invocation of `serializeToChunkedBuffer` (L#599), here 
we have a good estimate of the size - something to leverage to minimize the 
cost of `serializeToChunkedBuffer`.



##########
core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala:
##########
@@ -17,21 +17,22 @@
 
 package org.apache.spark.util.io
 
-import java.io.{File, FileInputStream, InputStream}
+import java.io.{Externalizable, File, FileInputStream, InputStream, 
ObjectInput, ObjectOutput, OutputStream}
 import java.nio.ByteBuffer
 import java.nio.channels.WritableByteChannel
 
 import com.google.common.io.ByteStreams
 import com.google.common.primitives.UnsignedBytes
 import org.apache.commons.io.IOUtils
-
 import org.apache.spark.SparkEnv
+
 import org.apache.spark.internal.config
 import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, 
ManagedBuffer}
 import org.apache.spark.network.util.{ByteArrayWritableChannel, 
LimitedInputStream}
 import org.apache.spark.storage.{EncryptedManagedBuffer, StorageUtils}
 import org.apache.spark.unsafe.array.ByteArrayMethods
 import org.apache.spark.util.Utils
+import org.apache.spark.util.io.ChunkedByteBuffer.writeBufferToDest

Review Comment:
   nit: Replace this import with inline invocation



##########
core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala:
##########
@@ -84,6 +92,64 @@ private[spark] class ChunkedByteBuffer(var chunks: 
Array[ByteBuffer]) {
     }
   }
 
+  /**
+   * write to stream with zero copy if possible
+   */
+  def writeToStream(out: OutputStream): Unit = {
+    var buffer: Array[Byte] = null
+    val bufferLen = 1024 * 1024
+    writeBufferToDest(this, out.write)
+  }
+
+  /**
+   * write to ObjectOutput with zero copy if possible
+   */
+  override def writeExternal(out: ObjectOutput): Unit = {
+    // we want to keep the chunks layout
+    out.writeInt(chunks.length)
+    chunks.foreach(buffer => out.writeInt(buffer.limit()))
+    chunks.foreach(buffer => out.writeBoolean(buffer.isDirect))
+    writeBufferToDest(this, out.write)
+  }
+
+  override def readExternal(in: ObjectInput): Unit = {
+    val chunksNum = in.readInt()
+    val indices = 0 until chunksNum
+    val chunksSize = indices.map(_ => in.readInt())
+    val chunksDirect = indices.map(_ => in.readBoolean())

Review Comment:
   Do we have cases where we want to preserve whether the buffer was direct or 
not across VMs? The current use case does not require it?
   +CC @Ngone51 ?
   
   If not, drop this and simplify the impl ?



##########
core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala:
##########
@@ -84,6 +92,64 @@ private[spark] class ChunkedByteBuffer(var chunks: 
Array[ByteBuffer]) {
     }
   }
 
+  /**
+   * write to stream with zero copy if possible
+   */
+  def writeToStream(out: OutputStream): Unit = {
+    var buffer: Array[Byte] = null
+    val bufferLen = 1024 * 1024
+    writeBufferToDest(this, out.write)
+  }
+
+  /**
+   * write to ObjectOutput with zero copy if possible
+   */
+  override def writeExternal(out: ObjectOutput): Unit = {
+    // we want to keep the chunks layout
+    out.writeInt(chunks.length)
+    chunks.foreach(buffer => out.writeInt(buffer.limit()))
+    chunks.foreach(buffer => out.writeBoolean(buffer.isDirect))
+    writeBufferToDest(this, out.write)
+  }
+
+  override def readExternal(in: ObjectInput): Unit = {
+    val chunksNum = in.readInt()
+    val indices = 0 until chunksNum
+    val chunksSize = indices.map(_ => in.readInt())
+    val chunksDirect = indices.map(_ => in.readBoolean())
+    val chunks = new Array[ByteBuffer](chunksNum)
+
+    val copyBufferLen = 1024 * 1024

Review Comment:
   Pull this out as a constant - both `serializeToChunkedBuffer` and this 
method rely on this value implicitly for efficiency.



##########
core/src/main/scala/org/apache/spark/serializer/SerializerHelper.scala:
##########
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.serializer
+
+import java.nio.ByteBuffer
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.io.{ChunkedByteBuffer, 
ChunkedByteBufferOutputStream}
+
+private[spark] object SerializerHelper extends Logging {
+  def serializeToChunkedBuffer[T: ClassTag](
+      serializerInstance: SerializerInstance,
+      t: T): ChunkedByteBuffer = {
+    val cbbos = new ChunkedByteBufferOutputStream(1024 * 1024, 
ByteBuffer.allocate)

Review Comment:
   The default size here is fairly large - most results are fairly small 
(except for a small subset).
   Unfortunately, a small `chunkSize` leads to suboptimal performance for 
`ChunkedByteBufferOutputStream` - this change has the potential to cause 
performance regressions - something to watch out for.



##########
core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala:
##########
@@ -207,6 +273,41 @@ private[spark] object ChunkedByteBuffer {
     }
     out.toChunkedByteBuffer
   }
+
+  /**
+   * util function writing ChunkedByteBuffer to destination with zero copy if 
possible(when
+   * ChunkedByteBuffer is backed by bytes array)
+   * @param src ChunkedByteBuffer as src
+   * @param write write function writing data to destination, following the 
semantic of
+   *              java.io.outputStream.write(buffer Array[Byte], off Int, len 
Int)
+   */
+  private def writeBufferToDest(
+      src: ChunkedByteBuffer,
+      write: (Array[Byte], Int, Int) => Unit): Unit = {
+    var buffer: Array[Byte] = null
+    val bufferLen = 1024 * 1024
+
+    src.chunks.foreach { chunk => {

Review Comment:
   Use `getChunks` instead of accessing `chunks` directly, and simplify the impl.
   The `rewind` below, for example, modifies the buffer directly.



##########
core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala:
##########
@@ -207,6 +273,41 @@ private[spark] object ChunkedByteBuffer {
     }
     out.toChunkedByteBuffer
   }
+
+  /**
+   * util function writing ChunkedByteBuffer to destination with zero copy if 
possible(when
+   * ChunkedByteBuffer is backed by bytes array)
+   * @param src ChunkedByteBuffer as src
+   * @param write write function writing data to destination, following the 
semantic of
+   *              java.io.outputStream.write(buffer Array[Byte], off Int, len 
Int)
+   */
+  private def writeBufferToDest(
+      src: ChunkedByteBuffer,
+      write: (Array[Byte], Int, Int) => Unit): Unit = {
+    var buffer: Array[Byte] = null
+    val bufferLen = 1024 * 1024
+
+    src.chunks.foreach { chunk => {
+      if (chunk.hasArray) {
+        // zero copy if the bytebuffer is backed by bytes array
+        write(chunk.array(), chunk.arrayOffset(), chunk.limit())
+      } else {
+        // fallback to copy approach
+        if (buffer == null) {
+          buffer = new Array[Byte](bufferLen)
+        }
+        val originalPos = chunk.position()
+        chunk.rewind()
+        var bytesToRead = Math.min(chunk.remaining(), bufferLen)
+        while (bytesToRead > 0) {
+          chunk.get(buffer, 0, bytesToRead)
+          write(buffer, 0, bytesToRead)
+          bytesToRead = Math.min(chunk.remaining(), bufferLen)
+        }
+        chunk.position(originalPos)
+      }
+    }}

Review Comment:
   Move this as `writeFully(out: OutputStream)` in `ChunkedByteBuffer` ?



##########
core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala:
##########
@@ -84,6 +92,64 @@ private[spark] class ChunkedByteBuffer(var chunks: 
Array[ByteBuffer]) {
     }
   }
 
+  /**
+   * write to stream with zero copy if possible
+   */
+  def writeToStream(out: OutputStream): Unit = {
+    var buffer: Array[Byte] = null
+    val bufferLen = 1024 * 1024
+    writeBufferToDest(this, out.write)
+  }

Review Comment:
   Why do we need this method ?
   If not required, removing it will simplify `writeExternal`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] mridulm commented on a diff in pull request #38064: [SPARK-40622][SQL][CORE]Result of a single task in collect() must fit in 2GB

Reply via email to