waitinfuture commented on code in PR #2064:
URL:
https://github.com/apache/incubator-celeborn/pull/2064#discussion_r1386173946
##########
worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala:
##########
@@ -821,28 +807,11 @@ class PushDataHandler(val workerSource: WorkerSource)
extends BaseMessageHandler
// for primary, send data to replica
if (location.hasPeer && isPrimary) {
- // to do
- wrappedCallback.onSuccess(ByteBuffer.wrap(Array[Byte]()))
- } else {
- wrappedCallback.onSuccess(ByteBuffer.wrap(Array[Byte]()))
- }
-
- try {
- fileWriter.write(body)
- } catch {
- case e: AlreadyClosedException =>
- fileWriter.decrementPendingWrites()
- val (mapId, attemptId) = getMapAttempt(body)
- val endedAttempt =
- if (shuffleMapperAttempts.containsKey(shuffleKey)) {
- shuffleMapperAttempts.get(shuffleKey).get(mapId)
- } else -1
- // TODO just info log for ended attempt
- logError(
- s"[handleMapPartitionPushData] Append data failed for task(shuffle
$shuffleKey, map $mapId, attempt" +
- s" $attemptId), caused by AlreadyClosedException, endedAttempt
$endedAttempt, error message: ${e.getMessage}")
- case e: Exception =>
- logError("Exception encountered when write.", e)
+ writeDataWithExceptionHandling(fileWriter, body, shuffleKey).onComplete {
+ case Success(_) =>
wrappedCallback.onSuccess(ByteBuffer.wrap(Array[Byte]()))
+ // TODO: handle exception
+ case Failure(_) =>
wrappedCallback.onSuccess(ByteBuffer.wrap(Array[Byte]()))
+ }
Review Comment:
Missed `else` branch here; I believe this is why the Flink UTs failed.
`handleMapPartitionPushData` is currently used for Flink.
##########
worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala:
##########
@@ -516,7 +517,18 @@ class PushDataHandler(val workerSource: WorkerSource)
extends BaseMessageHandler
fileWriters.foreach(_.decrementPendingWrites())
return
}
-
+ def writePushMergedDataLocal() = {
+ var fileWriter: FileWriter = null
+ batchOffsets.zip(fileWriters).map { case (offset, writer) =>
+ fileWriter = writer
+ val batchBody = body.slice(
+ body.readerIndex() + offset,
+ if (offset == batchOffsets.last) body.readableBytes() - offset
+ else batchOffsets(batchOffsets.indexOf(offset) + 1) - offset)
+ writeDataWithExceptionHandling(fileWriter, batchBody, shuffleKey)
Review Comment:
This means that for each partition in this `PushMergedData`, a `Future` will be
created — this could be hundreds or even a thousand. Even for `PushData`, which
occurs at high frequency, creating one or more `Future`s may impact performance.
##########
worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala:
##########
@@ -246,6 +250,7 @@ class PushDataHandler(val workerSource: WorkerSource)
extends BaseMessageHandler
return;
}
+ val localWriteFuture = writeDataWithExceptionHandling(fileWriter, body,
shuffleKey)
Review Comment:
In case `doReplicate == true`, we should make sure to call
`pushData.body().retain()` before calling `writeDataWithExceptionHandling`, or
else it might happen that the data is flushed and released before `retain` is
called, then the refCnt will be 0 and netty might reuse it, causing memory
corruption.
##########
worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala:
##########
@@ -751,6 +736,7 @@ class PushDataHandler(val workerSource: WorkerSource)
extends BaseMessageHandler
requestId: Long,
handler: () => Unit): Unit = {
try {
+ message.body().retain()
Review Comment:
I don't think we should call `retain` here, or it will cause a Netty memory
leak.
##########
worker/src/main/scala/org/apache/celeborn/service/deploy/worker/PushDataHandler.scala:
##########
@@ -516,7 +517,18 @@ class PushDataHandler(val workerSource: WorkerSource)
extends BaseMessageHandler
fileWriters.foreach(_.decrementPendingWrites())
return
}
-
+ def writePushMergedDataLocal() = {
+ var fileWriter: FileWriter = null
+ batchOffsets.zip(fileWriters).map { case (offset, writer) =>
+ fileWriter = writer
+ val batchBody = body.slice(
+ body.readerIndex() + offset,
+ if (offset == batchOffsets.last) body.readableBytes() - offset
+ else batchOffsets(batchOffsets.indexOf(offset) + 1) - offset)
+ writeDataWithExceptionHandling(fileWriter, batchBody, shuffleKey)
+ }.toSeq
+ }
+ val localWriteFutures = writePushMergedDataLocal()
Review Comment:
ditto
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]