xintongsong commented on code in PR #22833:
URL: https://github.com/apache/flink/pull/22833#discussion_r1239483891


##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/BufferAccumulator.java:
##########
@@ -46,7 +46,10 @@ public interface BufferAccumulator extends AutoCloseable {
      * transformed into finished buffers.
      */
     void receive(
-            ByteBuffer record, TieredStorageSubpartitionId subpartitionId, Buffer.DataType dataType)
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)

Review Comment:
   When `isBroadcast` is true, what `subpartitionId` are we expecting? Will it be ignored, and can it therefore be an arbitrary id, or even `null`? Or is it required to be some special value?
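
   If the id is simply ignored for broadcast records, spelling that out in the interface Javadoc would make the contract explicit. A possible wording (purely a sketch, assuming the id is indeed ignored):

   ```java
   /**
    * @param subpartitionId the target subpartition of the record; when {@code isBroadcast} is
    *     true, a broadcast record targets all subpartitions, so this id would be ignored
    * @param isBroadcast whether the record should be written to all subpartitions
    */
   void receive(
           ByteBuffer record,
           TieredStorageSubpartitionId subpartitionId,
           Buffer.DataType dataType,
           boolean isBroadcast)
           throws IOException;
   ```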



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link 
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the 
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred 
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate 
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used 
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link 
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition 
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be 
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+    /** The number of the subpartitions. */
+    private final int numSubpartitions;
+
+    /** The total number of the buffers used by the {@link 
SortBufferAccumulator}. */
+    private final int numBuffers;
+
+    /** The byte size of one single buffer. */
+    private final int bufferSizeBytes;
+
+    /** The empty buffers without storing data. */
+    private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+    /** The memory manager of the tiered storage. */
+    private final TieredStorageMemoryManager storeMemoryManager;
+
+    /** The number of buffers for sorting used in the {@link 
SortBufferContainer}. */
+    private int numBuffersForSort;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating broadcast data. Note 
that this can be null
+     * before using it to store records, and this buffer container will be 
released once flushed.
+     */
+    @Nullable private SortBufferContainer broadcastDataBuffer;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating non-broadcast data. 
Note that this can be
+     * null before using it to store records, and this buffer container will 
be released once
+     * flushed.
+     */
+    @Nullable private SortBufferContainer unicastDataBuffer;
+
+    /**
+     * The buffer recycler. Note that this can be null before requesting 
buffers from the memory
+     * manager.
+     */
+    @Nullable private BufferRecycler bufferRecycler;
+
+    /**
+     * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not 
prepared during
+     * construction, requiring the field to be initialized during setup. 
Therefore, it is necessary
+     * to verify whether this field is null before using it.
+     */
+    @Nullable
+    private BiConsumer<TieredStorageSubpartitionId, List<Buffer>> accumulatedBufferFlusher;
+
+    public SortBufferAccumulator(
+            int numSubpartitions,
+            int numBuffers,
+            int bufferSizeBytes,
+            TieredStorageMemoryManager storeMemoryManager) {
+        this.numSubpartitions = numSubpartitions;
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffers = numBuffers;
+        this.storeMemoryManager = storeMemoryManager;
+    }
+
+    @Override
+    public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>> bufferFlusher) {
+        this.accumulatedBufferFlusher = bufferFlusher;
+    }
+
+    @Override
+    public void receive(
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)
+            throws IOException {
+        int targetSubpartition = subpartitionId.getSubpartitionId();
+        SortBufferContainer sortBufferContainer =
+                isBroadcast ? getBroadcastDataBuffer() : getUnicastDataBuffer();
+        if (!sortBufferContainer.writeRecord(record, targetSubpartition, dataType)) {
+            return;
+        }
+
+        if (!sortBufferContainer.hasRemaining()) {
+            sortBufferContainer.release();
+            writeLargeRecord(record, targetSubpartition, dataType);
+            return;
+        }

Review Comment:
   It took me a while to figure out why `sortBufferContainer.hasRemaining()` being `false` should result in writing a large record. It would be easier if we added a simple comment explaining it: the container is empty, yet we failed to write the record into it, which suggests the record is larger than the container can hold.
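
   For instance, something along these lines (the comment wording is just a sketch over the reviewed code):

   ```java
   if (!sortBufferContainer.hasRemaining()) {
       // The container is still empty, yet the write failed, which means the record is
       // larger than the whole container can hold; spill it directly as a large record.
       sortBufferContainer.release();
       writeLargeRecord(record, targetSubpartition, dataType);
       return;
   }
   ```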



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferContainer.java:
##########
@@ -0,0 +1,454 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.core.memory.MemorySegmentFactory;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The buffer container for accumulating the records into {@link Buffer}s. 
After accumulating, the
+ * {@link SortBufferAccumulator} will read the sorted buffers.
+ */
+public class SortBufferContainer {
+
+    /**
+     * Size of an index entry: 4 bytes for record length, 4 bytes for data 
type and 8 bytes for
+     * pointer to next entry.
+     */
+    private static final int INDEX_ENTRY_SIZE = 4 + 4 + 8;
+
+    /** A list of {@link MemorySegment}s used to store data in memory. */
+    private final LinkedList<MemorySegment> freeSegments;
+
+    /** A segment list as a joint buffer which stores all records and index 
entries. */
+    private final ArrayList<MemorySegment> dataSegments;
+
+    /** {@link BufferRecycler} used to recycle {@link #freeSegments}. */
+    private final BufferRecycler bufferRecycler;
+
+    /** Addresses of the first record's index entry for each subpartition. */
+    private final long[] subpartitionFirstBufferIndexEntries;
+
+    /** Addresses of the last record's index entry for each subpartition. */
+    private final long[] subpartitionLastBufferIndexEntries;
+
+    /** Size of buffers requested from buffer pool. All buffers must be of the 
same size. */
+    private final int bufferSizeBytes;
+
+    /** Number of guaranteed buffers can be allocated from the buffer pool for 
data sort. */
+    private final int numBuffersForSort;
+
+    // ------------------------------------------------------------------------
+    // The statistics and states
+    // ------------------------------------------------------------------------
+
+    /** Total number of bytes already appended to this sort buffer. */
+    private long numTotalBytes;
+
+    /** Total number of bytes already read from this sort buffer. */
+    private long numTotalBytesRead;
+
+    /** Whether this sort buffer is finished. One can only read a finished 
sort buffer. */
+    private boolean isFinished;
+
+    /** Whether this sort buffer is released. A released sort buffer can not 
be used. */
+    private boolean isReleased;
+
+    // ------------------------------------------------------------------------
+    // For writing
+    // ------------------------------------------------------------------------
+
+    /** Array index in the segment list of the current available buffer for 
writing. */
+    private int writeBufferIndex;
+
+    /** Next position in the current available buffer for writing. */
+    private int writeOffsetInCurrentBuffer;
+
+    // ------------------------------------------------------------------------
+    // For reading
+    // ------------------------------------------------------------------------
+
+    /** Index entry address of the current record or event to be read. */
+    private long readBufferIndexEntry;
+
+    /**
+     * Record the bytes remaining after the last read, which must be 
initialized before reading a
+     * new record.
+     */
+    private int recordRemainingBytesToRead;
+
+    /** The subpartition that is reading data from. */
+    private int readingSubpartitionId = -1;
+
+    SortBufferContainer(
+            LinkedList<MemorySegment> freeSegments,
+            BufferRecycler bufferRecycler,
+            int numSubpartitions,
+            int bufferSizeBytes,
+            int numBuffersForSort) {
+        checkArgument(bufferSizeBytes > INDEX_ENTRY_SIZE, "Buffer size is too small.");
+        checkArgument(numBuffersForSort > 0, "No guaranteed buffers for sort.");
+        checkState(numBuffersForSort <= freeSegments.size(), "Wrong number of free segments.");
+
+        this.freeSegments = checkNotNull(freeSegments);
+        this.bufferRecycler = checkNotNull(bufferRecycler);
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffersForSort = numBuffersForSort;
+        this.dataSegments = new ArrayList<>();
+        this.subpartitionFirstBufferIndexEntries = new long[numSubpartitions];
+        this.subpartitionLastBufferIndexEntries = new long[numSubpartitions];
+
+        Arrays.fill(subpartitionFirstBufferIndexEntries, -1L);
+        Arrays.fill(subpartitionLastBufferIndexEntries, -1L);
+    }
+
+    // ------------------------------------------------------------------------
+    //  Called by SortBufferAccumulator
+    // ------------------------------------------------------------------------
+
+    /**
+     * Note that no partial records will be written to this {@link SortBufferContainer}, which means
+     * that either all data of target record will be written or nothing will be written.
+     *
+     * @param record the record to be written
+     * @param subpartitionId the subpartition id
+     * @param dataType the data type of the record
+     * @return true if the {@link SortBufferContainer} is full, or return false if the contianer is
+     *     not full

Review Comment:
   This goes against convention. Usually, when a write-like method returns true, it means the write operation has succeeded.
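
   A sketch of the more conventional contract, where the return value reports whether the write itself succeeded (the Javadoc wording here is hypothetical):

   ```java
   /**
    * Tries to append the whole record; no partial records are written.
    *
    * @return true if the record was fully written, or false if there was not enough space
    *     left and nothing was written
    */
   boolean writeRecord(ByteBuffer record, int subpartitionId, Buffer.DataType dataType);
   ```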



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link 
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the 
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred 
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate 
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used 
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link 
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition 
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be 
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+    /** The number of the subpartitions. */
+    private final int numSubpartitions;
+
+    /** The total number of the buffers used by the {@link 
SortBufferAccumulator}. */
+    private final int numBuffers;
+
+    /** The byte size of one single buffer. */
+    private final int bufferSizeBytes;
+
+    /** The empty buffers without storing data. */
+    private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+    /** The memory manager of the tiered storage. */
+    private final TieredStorageMemoryManager storeMemoryManager;
+
+    /** The number of buffers for sorting used in the {@link 
SortBufferContainer}. */
+    private int numBuffersForSort;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating broadcast data. Note 
that this can be null
+     * before using it to store records, and this buffer container will be 
released once flushed.
+     */
+    @Nullable private SortBufferContainer broadcastDataBuffer;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating non-broadcast data. 
Note that this can be
+     * null before using it to store records, and this buffer container will 
be released once
+     * flushed.
+     */
+    @Nullable private SortBufferContainer unicastDataBuffer;
+
+    /**
+     * The buffer recycler. Note that this can be null before requesting 
buffers from the memory
+     * manager.
+     */
+    @Nullable private BufferRecycler bufferRecycler;
+
+    /**
+     * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not 
prepared during
+     * construction, requiring the field to be initialized during setup. 
Therefore, it is necessary
+     * to verify whether this field is null before using it.
+     */
+    @Nullable
+    private BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
accumulatedBufferFlusher;
+
+    public SortBufferAccumulator(
+            int numSubpartitions,
+            int numBuffers,
+            int bufferSizeBytes,
+            TieredStorageMemoryManager storeMemoryManager) {
+        this.numSubpartitions = numSubpartitions;
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffers = numBuffers;
+        this.storeMemoryManager = storeMemoryManager;
+    }
+
+    @Override
+    public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
bufferFlusher) {
+        this.accumulatedBufferFlusher = bufferFlusher;
+    }
+
+    @Override
+    public void receive(
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)
+            throws IOException {
+        int targetSubpartition = subpartitionId.getSubpartitionId();
+        SortBufferContainer sortBufferContainer =
+                isBroadcast ? getBroadcastDataBuffer() : 
getUnicastDataBuffer();
+        if (!sortBufferContainer.writeRecord(record, targetSubpartition, 
dataType)) {
+            return;
+        }
+
+        if (!sortBufferContainer.hasRemaining()) {
+            sortBufferContainer.release();
+            writeLargeRecord(record, targetSubpartition, dataType);
+            return;
+        }
+
+        flushDataBuffer(sortBufferContainer);
+        sortBufferContainer.release();
+        if (record.hasRemaining()) {
+            receive(record, subpartitionId, dataType, isBroadcast);
+        }
+    }
+
+    @Override
+    public void close() {
+        flushUnicastDataBuffer();
+        flushBroadcastDataBuffer();
+        releaseFreeBuffers();
+        releaseDataBuffer(unicastDataBuffer);
+        releaseDataBuffer(broadcastDataBuffer);
+    }
+
+    // ------------------------------------------------------------------------
+    //  Internal Methods
+    // ------------------------------------------------------------------------
+
+    private SortBufferContainer getUnicastDataBuffer() {
+        flushBroadcastDataBuffer();
+
+        if (unicastDataBuffer != null
+                && !unicastDataBuffer.isFinished()
+                && !unicastDataBuffer.isReleased()) {
+            return unicastDataBuffer;
+        }
+
+        unicastDataBuffer = createNewDataBuffer();
+        return unicastDataBuffer;
+    }
+
+    private SortBufferContainer getBroadcastDataBuffer() {
+        flushUnicastDataBuffer();
+
+        if (broadcastDataBuffer != null
+                && !broadcastDataBuffer.isFinished()
+                && !broadcastDataBuffer.isReleased()) {
+            return broadcastDataBuffer;
+        }
+
+        broadcastDataBuffer = createNewDataBuffer();
+        return broadcastDataBuffer;
+    }
+
+    private SortBufferContainer createNewDataBuffer() {
+        requestNetworkBuffers();
+
+        return new SortBufferContainer(
+                freeSegments,
+                this::recycleBuffer,
+                numSubpartitions,
+                bufferSizeBytes,
+                numBuffersForSort);
+    }
+
+    private void requestGuaranteedBuffers() {
+        int effectiveRequiredBuffers = effectiveNumRequestedBuffers();
+
+        while (freeSegments.size() < effectiveRequiredBuffers) {
+            BufferBuilder bufferBuilder = 
storeMemoryManager.requestBufferBlocking(this);
+            Buffer buffer = 
bufferBuilder.createBufferConsumerFromBeginning().build();
+            freeSegments.add(checkNotNull(buffer).getMemorySegment());
+            if (bufferRecycler == null) {
+                bufferRecycler = buffer.getRecycler();
+            }
+        }
+    }
+
+    private void requestNetworkBuffers() {
+        requestGuaranteedBuffers();
+
+        // Use the half of the buffers for writing, and the other half for 
reading
+        numBuffersForSort = freeSegments.size() / 2;
+    }
+
+    private void flushDataBuffer(SortBufferContainer sortBufferContainer) {
+        if (sortBufferContainer == null
+                || sortBufferContainer.isReleased()
+                || !sortBufferContainer.hasRemaining()) {
+            return;
+        }
+        sortBufferContainer.finish();
+
+        do {
+            MemorySegment freeSegment = getFreeSegment();
+            Pair<Integer, Buffer> bufferAndSubpartitionId =
+                    sortBufferContainer.readBuffer(freeSegment);
+            if (bufferAndSubpartitionId == null) {
+                if (freeSegment != null) {
+                    recycleBuffer(freeSegment);
+                }
+                break;
+            }
+            addFinishedBuffer(bufferAndSubpartitionId);
+        } while (true);
+
+        releaseFreeBuffers();
+        sortBufferContainer.release();
+    }
+
+    private void flushBroadcastDataBuffer() {
+        if (broadcastDataBuffer != null) {
+            flushDataBuffer(broadcastDataBuffer);
+            broadcastDataBuffer.release();
+            broadcastDataBuffer = null;
+        }
+    }
+
+    private void flushUnicastDataBuffer() {
+        if (unicastDataBuffer != null) {
+            flushDataBuffer(unicastDataBuffer);
+            unicastDataBuffer.release();
+            unicastDataBuffer = null;
+        }
+    }
+
+    private void flushContainerWhenEndOfPartition(
+            boolean isEndOfPartition, SortBufferContainer sortBufferContainer) 
{
+        if (isEndOfPartition) {
+            flushDataBuffer(sortBufferContainer);
+        }
+    }
+
+    private void writeLargeRecord(
+            ByteBuffer record, int targetSubpartition, Buffer.DataType 
dataType) {
+
+        checkState(dataType != Buffer.DataType.EVENT_BUFFER);
+        while (record.hasRemaining()) {
+            int toCopy = Math.min(record.remaining(), bufferSizeBytes);
+            MemorySegment writeBuffer = checkNotNull(getFreeSegment());
+            writeBuffer.put(0, record, toCopy);
+
+            addFinishedBuffer(
+                    Pair.of(
+                            targetSubpartition,
+                            new NetworkBuffer(
+                                    writeBuffer, checkNotNull(bufferRecycler), 
dataType, toCopy)));
+        }
+
+        releaseFreeBuffers();
+    }
+
+    private MemorySegment getFreeSegment() {
+        MemorySegment freeSegment = freeSegments.poll();
+        if (freeSegment == null) {
+            BufferBuilder bufferBuilder = 
storeMemoryManager.requestBufferBlocking(this);
+            Buffer buffer = 
bufferBuilder.createBufferConsumerFromBeginning().build();
+            freeSegment = buffer.getMemorySegment();
+        }
+        return freeSegment;
+    }
+
+    private int effectiveNumRequestedBuffers() {
+        return Math.min(numSubpartitions + 1, numBuffers);

Review Comment:
   Why `numSubpartitions + 1`?



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferContainer.java:
##########
@@ -0,0 +1,454 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.core.memory.MemorySegmentFactory;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The buffer container for accumulating the records into {@link Buffer}s. 
After accumulating, the
+ * {@link SortBufferAccumulator} will read the sorted buffers.
+ */
+public class SortBufferContainer {

Review Comment:
   1. The name is confusing. I'd suggest simply `SortBuffer` or `BufferSorter`.
   2. I think the important characteristic of this component is that you can write data into it in any order, but you read data from it in an order grouped by subpartition. This is not clear enough from the JavaDoc.
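
   For example, a class-level JavaDoc along these lines would capture that characteristic (the wording and the suggested name are only a sketch):

   ```java
   /**
    * Accepts records targeting arbitrary subpartitions in any order; once finished, the
    * accumulated data can be read back as {@link Buffer}s grouped by subpartition.
    */
   public class SortBuffer {
       // ...
   }
   ```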



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link 
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the 
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred 
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate 
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used 
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link 
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition 
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be 
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+    /** The number of the subpartitions. */
+    private final int numSubpartitions;
+
+    /** The total number of the buffers used by the {@link 
SortBufferAccumulator}. */
+    private final int numBuffers;
+
+    /** The byte size of one single buffer. */
+    private final int bufferSizeBytes;
+
+    /** The empty buffers without storing data. */
+    private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+    /** The memory manager of the tiered storage. */
+    private final TieredStorageMemoryManager storeMemoryManager;
+
+    /** The number of buffers for sorting used in the {@link 
SortBufferContainer}. */
+    private int numBuffersForSort;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating broadcast data. Note 
that this can be null
+     * before using it to store records, and this buffer container will be 
released once flushed.
+     */
+    @Nullable private SortBufferContainer broadcastDataBuffer;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating non-broadcast data. 
Note that this can be
+     * null before using it to store records, and this buffer container will 
be released once
+     * flushed.
+     */
+    @Nullable private SortBufferContainer unicastDataBuffer;
+
+    /**
+     * The buffer recycler. Note that this can be null before requesting 
buffers from the memory
+     * manager.
+     */
+    @Nullable private BufferRecycler bufferRecycler;
+
+    /**
+     * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not 
prepared during
+     * construction, requiring the field to be initialized during setup. 
Therefore, it is necessary
+     * to verify whether this field is null before using it.
+     */
+    @Nullable
+    private BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
accumulatedBufferFlusher;
+
+    public SortBufferAccumulator(
+            int numSubpartitions,
+            int numBuffers,
+            int bufferSizeBytes,
+            TieredStorageMemoryManager storeMemoryManager) {
+        this.numSubpartitions = numSubpartitions;
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffers = numBuffers;
+        this.storeMemoryManager = storeMemoryManager;
+    }
+
+    @Override
+    public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
bufferFlusher) {
+        this.accumulatedBufferFlusher = bufferFlusher;
+    }
+
+    @Override
+    public void receive(
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)
+            throws IOException {
+        int targetSubpartition = subpartitionId.getSubpartitionId();
+        SortBufferContainer sortBufferContainer =
+                isBroadcast ? getBroadcastDataBuffer() : 
getUnicastDataBuffer();
+        if (!sortBufferContainer.writeRecord(record, targetSubpartition, 
dataType)) {
+            return;
+        }
+
+        if (!sortBufferContainer.hasRemaining()) {
+            sortBufferContainer.release();
+            writeLargeRecord(record, targetSubpartition, dataType);
+            return;
+        }
+
+        flushDataBuffer(sortBufferContainer);
+        sortBufferContainer.release();
+        if (record.hasRemaining()) {

Review Comment:
   Shouldn't this always be `true`?



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link 
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the 
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred 
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate 
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used 
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link 
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition 
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be 
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+    /** The number of the subpartitions. */
+    private final int numSubpartitions;
+
+    /** The total number of the buffers used by the {@link 
SortBufferAccumulator}. */
+    private final int numBuffers;
+
+    /** The byte size of one single buffer. */
+    private final int bufferSizeBytes;
+
+    /** The empty buffers without storing data. */
+    private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+    /** The memory manager of the tiered storage. */
+    private final TieredStorageMemoryManager storeMemoryManager;
+
+    /** The number of buffers for sorting used in the {@link 
SortBufferContainer}. */
+    private int numBuffersForSort;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating broadcast data. Note 
that this can be null
+     * before using it to store records, and this buffer container will be 
released once flushed.
+     */
+    @Nullable private SortBufferContainer broadcastDataBuffer;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating non-broadcast data. 
Note that this can be
+     * null before using it to store records, and this buffer container will 
be released once
+     * flushed.
+     */
+    @Nullable private SortBufferContainer unicastDataBuffer;
+
+    /**
+     * The buffer recycler. Note that this can be null before requesting 
buffers from the memory
+     * manager.
+     */
+    @Nullable private BufferRecycler bufferRecycler;
+
+    /**
+     * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not 
prepared during
+     * construction, requiring the field to be initialized during setup. 
Therefore, it is necessary
+     * to verify whether this field is null before using it.
+     */
+    @Nullable
+    private BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
accumulatedBufferFlusher;
+
+    public SortBufferAccumulator(
+            int numSubpartitions,
+            int numBuffers,
+            int bufferSizeBytes,
+            TieredStorageMemoryManager storeMemoryManager) {
+        this.numSubpartitions = numSubpartitions;
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffers = numBuffers;
+        this.storeMemoryManager = storeMemoryManager;
+    }
+
+    @Override
+    public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
bufferFlusher) {
+        this.accumulatedBufferFlusher = bufferFlusher;
+    }
+
+    @Override
+    public void receive(
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)
+            throws IOException {
+        int targetSubpartition = subpartitionId.getSubpartitionId();
+        SortBufferContainer sortBufferContainer =
+                isBroadcast ? getBroadcastDataBuffer() : getUnicastDataBuffer();

Review Comment:
   It is implicit that getting one of the two data buffers results in the other one being flushed.
   
   I'd suggest the following (a rough sketch follows the list):
   - Have only one field `currentDataBuffer`, instead of the two nullable fields `broadcastDataBuffer` and `unicastDataBuffer` of which only one can be non-null at any time.
   - Instead of calling `getBroadcast/UnicastDataBuffer`, we can have a method `switchCurrentDataBufferIfNeeded(isBroadcast)` in which we update the value of `currentDataBuffer`. This would make the flushing less surprising.
   - We might need another field to remember whether the current data buffer is broadcast or unicast.
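
   A minimal sketch of that refactoring, assuming a hypothetical helper `flushCurrentDataBuffer()` that flushes, releases and clears the current buffer (all names are illustrative):

   ```java
   @Nullable private SortBufferContainer currentDataBuffer;
   private boolean isCurrentDataBufferBroadcast;

   private SortBufferContainer switchCurrentDataBufferIfNeeded(boolean isBroadcast) {
       if (currentDataBuffer != null
               && isBroadcast == isCurrentDataBufferBroadcast
               && !currentDataBuffer.isFinished()
               && !currentDataBuffer.isReleased()) {
           return currentDataBuffer;
       }
       // Switching between broadcast and unicast (or creating the very first buffer):
       // flush and release whatever was accumulated so far, then start a new container.
       flushCurrentDataBuffer();
       currentDataBuffer = createNewDataBuffer();
       isCurrentDataBufferBroadcast = isBroadcast;
       return currentDataBuffer;
   }
   ```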



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link 
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the 
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred 
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate 
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used 
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link 
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition 
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be 
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+    /** The number of the subpartitions. */
+    private final int numSubpartitions;
+
+    /** The total number of the buffers used by the {@link 
SortBufferAccumulator}. */
+    private final int numBuffers;
+
+    /** The byte size of one single buffer. */
+    private final int bufferSizeBytes;
+
+    /** The empty buffers without storing data. */
+    private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+    /** The memory manager of the tiered storage. */
+    private final TieredStorageMemoryManager storeMemoryManager;
+
+    /** The number of buffers for sorting used in the {@link 
SortBufferContainer}. */
+    private int numBuffersForSort;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating broadcast data. Note 
that this can be null
+     * before using it to store records, and this buffer container will be 
released once flushed.
+     */
+    @Nullable private SortBufferContainer broadcastDataBuffer;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating non-broadcast data. 
Note that this can be
+     * null before using it to store records, and this buffer container will 
be released once
+     * flushed.
+     */
+    @Nullable private SortBufferContainer unicastDataBuffer;
+
+    /**
+     * The buffer recycler. Note that this can be null before requesting 
buffers from the memory
+     * manager.
+     */
+    @Nullable private BufferRecycler bufferRecycler;
+
+    /**
+     * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not 
prepared during
+     * construction, requiring the field to be initialized during setup. 
Therefore, it is necessary
+     * to verify whether this field is null before using it.
+     */
+    @Nullable
+    private BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
accumulatedBufferFlusher;
+
+    public SortBufferAccumulator(
+            int numSubpartitions,
+            int numBuffers,
+            int bufferSizeBytes,
+            TieredStorageMemoryManager storeMemoryManager) {
+        this.numSubpartitions = numSubpartitions;
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffers = numBuffers;
+        this.storeMemoryManager = storeMemoryManager;
+    }
+
+    @Override
+    public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
bufferFlusher) {
+        this.accumulatedBufferFlusher = bufferFlusher;
+    }
+
+    @Override
+    public void receive(
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)
+            throws IOException {
+        int targetSubpartition = subpartitionId.getSubpartitionId();
+        SortBufferContainer sortBufferContainer =
+                isBroadcast ? getBroadcastDataBuffer() : 
getUnicastDataBuffer();
+        if (!sortBufferContainer.writeRecord(record, targetSubpartition, 
dataType)) {
+            return;
+        }
+
+        if (!sortBufferContainer.hasRemaining()) {
+            sortBufferContainer.release();
+            writeLargeRecord(record, targetSubpartition, dataType);
+            return;
+        }
+
+        flushDataBuffer(sortBufferContainer);
+        sortBufferContainer.release();
+        if (record.hasRemaining()) {
+            receive(record, subpartitionId, dataType, isBroadcast);
+        }
+    }
+
+    @Override
+    public void close() {
+        flushUnicastDataBuffer();
+        flushBroadcastDataBuffer();
+        releaseFreeBuffers();
+        releaseDataBuffer(unicastDataBuffer);
+        releaseDataBuffer(broadcastDataBuffer);
+    }
+
+    // ------------------------------------------------------------------------
+    //  Internal Methods
+    // ------------------------------------------------------------------------
+
+    private SortBufferContainer getUnicastDataBuffer() {
+        flushBroadcastDataBuffer();
+
+        if (unicastDataBuffer != null
+                && !unicastDataBuffer.isFinished()
+                && !unicastDataBuffer.isReleased()) {
+            return unicastDataBuffer;
+        }
+
+        unicastDataBuffer = createNewDataBuffer();
+        return unicastDataBuffer;
+    }
+
+    private SortBufferContainer getBroadcastDataBuffer() {
+        flushUnicastDataBuffer();
+
+        if (broadcastDataBuffer != null
+                && !broadcastDataBuffer.isFinished()
+                && !broadcastDataBuffer.isReleased()) {
+            return broadcastDataBuffer;
+        }
+
+        broadcastDataBuffer = createNewDataBuffer();
+        return broadcastDataBuffer;
+    }
+
+    private SortBufferContainer createNewDataBuffer() {
+        requestNetworkBuffers();
+
+        return new SortBufferContainer(
+                freeSegments,
+                this::recycleBuffer,
+                numSubpartitions,
+                bufferSizeBytes,
+                numBuffersForSort);
+    }
+
+    private void requestGuaranteedBuffers() {
+        int effectiveRequiredBuffers = effectiveNumRequestedBuffers();
+
+        while (freeSegments.size() < effectiveRequiredBuffers) {
+            BufferBuilder bufferBuilder = 
storeMemoryManager.requestBufferBlocking(this);
+            Buffer buffer = 
bufferBuilder.createBufferConsumerFromBeginning().build();
+            freeSegments.add(checkNotNull(buffer).getMemorySegment());
+            if (bufferRecycler == null) {
+                bufferRecycler = buffer.getRecycler();
+            }
+        }
+    }
+
+    private void requestNetworkBuffers() {
+        requestGuaranteedBuffers();
+
+        // Use the half of the buffers for writing, and the other half for 
reading
+        numBuffersForSort = freeSegments.size() / 2;
+    }
+
+    private void flushDataBuffer(SortBufferContainer sortBufferContainer) {
+        if (sortBufferContainer == null
+                || sortBufferContainer.isReleased()
+                || !sortBufferContainer.hasRemaining()) {
+            return;
+        }
+        sortBufferContainer.finish();
+
+        do {
+            MemorySegment freeSegment = getFreeSegment();
+            Pair<Integer, Buffer> bufferAndSubpartitionId =
+                    sortBufferContainer.readBuffer(freeSegment);
+            if (bufferAndSubpartitionId == null) {
+                if (freeSegment != null) {
+                    recycleBuffer(freeSegment);
+                }
+                break;
+            }
+            addFinishedBuffer(bufferAndSubpartitionId);
+        } while (true);
+
+        releaseFreeBuffers();
+        sortBufferContainer.release();
+    }
+
+    private void flushBroadcastDataBuffer() {
+        if (broadcastDataBuffer != null) {
+            flushDataBuffer(broadcastDataBuffer);
+            broadcastDataBuffer.release();
+            broadcastDataBuffer = null;
+        }
+    }
+
+    private void flushUnicastDataBuffer() {
+        if (unicastDataBuffer != null) {
+            flushDataBuffer(unicastDataBuffer);
+            unicastDataBuffer.release();
+            unicastDataBuffer = null;
+        }
+    }
+
+    private void flushContainerWhenEndOfPartition(
+            boolean isEndOfPartition, SortBufferContainer sortBufferContainer) 
{
+        if (isEndOfPartition) {
+            flushDataBuffer(sortBufferContainer);
+        }
+    }
+
+    private void writeLargeRecord(
+            ByteBuffer record, int targetSubpartition, Buffer.DataType 
dataType) {
+
+        checkState(dataType != Buffer.DataType.EVENT_BUFFER);
+        while (record.hasRemaining()) {
+            int toCopy = Math.min(record.remaining(), bufferSizeBytes);
+            MemorySegment writeBuffer = checkNotNull(getFreeSegment());
+            writeBuffer.put(0, record, toCopy);
+
+            addFinishedBuffer(
+                    Pair.of(
+                            targetSubpartition,
+                            new NetworkBuffer(
+                                    writeBuffer, checkNotNull(bufferRecycler), 
dataType, toCopy)));
+        }
+
+        releaseFreeBuffers();
+    }
+
+    private MemorySegment getFreeSegment() {
+        MemorySegment freeSegment = freeSegments.poll();
+        if (freeSegment == null) {
+            BufferBuilder bufferBuilder = 
storeMemoryManager.requestBufferBlocking(this);
+            Buffer buffer = 
bufferBuilder.createBufferConsumerFromBeginning().build();
+            freeSegment = buffer.getMemorySegment();
+        }
+        return freeSegment;
+    }
+
+    private int effectiveNumRequestedBuffers() {
+        return Math.min(numSubpartitions + 1, numBuffers);
+    }
+
+    private void releaseDataBuffer(SortBufferContainer sortBufferContainer) {
+        if (sortBufferContainer != null) {
+            sortBufferContainer.release();
+        }
+    }
+
+    private void addFinishedBuffer(Pair<Integer, Buffer> bufferAndSubpartitionId) {

Review Comment:
   ```suggestion
       private void flushBuffer(Pair<Integer, Buffer> bufferAndSubpartitionId) {
   ```



##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link 
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the 
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred 
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate 
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used 
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link 
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition 
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be 
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+    /** The number of the subpartitions. */
+    private final int numSubpartitions;
+
+    /** The total number of the buffers used by the {@link 
SortBufferAccumulator}. */
+    private final int numBuffers;
+
+    /** The byte size of one single buffer. */
+    private final int bufferSizeBytes;
+
+    /** The empty buffers without storing data. */
+    private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+    /** The memory manager of the tiered storage. */
+    private final TieredStorageMemoryManager storeMemoryManager;
+
+    /** The number of buffers for sorting used in the {@link 
SortBufferContainer}. */
+    private int numBuffersForSort;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating broadcast data. Note 
that this can be null
+     * before using it to store records, and this buffer container will be 
released once flushed.
+     */
+    @Nullable private SortBufferContainer broadcastDataBuffer;
+
+    /**
+     * The {@link SortBufferContainer} for accumulating non-broadcast data. 
Note that this can be
+     * null before using it to store records, and this buffer container will 
be released once
+     * flushed.
+     */
+    @Nullable private SortBufferContainer unicastDataBuffer;
+
+    /**
+     * The buffer recycler. Note that this can be null before requesting 
buffers from the memory
+     * manager.
+     */
+    @Nullable private BufferRecycler bufferRecycler;
+
+    /**
+     * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not 
prepared during
+     * construction, requiring the field to be initialized during setup. 
Therefore, it is necessary
+     * to verify whether this field is null before using it.
+     */
+    @Nullable
+    private BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
accumulatedBufferFlusher;
+
+    public SortBufferAccumulator(
+            int numSubpartitions,
+            int numBuffers,
+            int bufferSizeBytes,
+            TieredStorageMemoryManager storeMemoryManager) {
+        this.numSubpartitions = numSubpartitions;
+        this.bufferSizeBytes = bufferSizeBytes;
+        this.numBuffers = numBuffers;
+        this.storeMemoryManager = storeMemoryManager;
+    }
+
+    @Override
+    public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>> 
bufferFlusher) {
+        this.accumulatedBufferFlusher = bufferFlusher;
+    }
+
+    @Override
+    public void receive(
+            ByteBuffer record,
+            TieredStorageSubpartitionId subpartitionId,
+            Buffer.DataType dataType,
+            boolean isBroadcast)
+            throws IOException {
+        int targetSubpartition = subpartitionId.getSubpartitionId();
+        SortBufferContainer sortBufferContainer =
+                isBroadcast ? getBroadcastDataBuffer() : 
getUnicastDataBuffer();
+        if (!sortBufferContainer.writeRecord(record, targetSubpartition, 
dataType)) {
+            return;
+        }
+
+        if (!sortBufferContainer.hasRemaining()) {
+            sortBufferContainer.release();
+            writeLargeRecord(record, targetSubpartition, dataType);
+            return;
+        }
+
+        flushDataBuffer(sortBufferContainer);
+        sortBufferContainer.release();
+        if (record.hasRemaining()) {
+            receive(record, subpartitionId, dataType, isBroadcast);
+        }
+    }
+
+    @Override
+    public void close() {
+        flushUnicastDataBuffer();
+        flushBroadcastDataBuffer();
+        releaseFreeBuffers();
+        releaseDataBuffer(unicastDataBuffer);
+        releaseDataBuffer(broadcastDataBuffer);
+    }
+
+    // ------------------------------------------------------------------------
+    //  Internal Methods
+    // ------------------------------------------------------------------------
+
+    private SortBufferContainer getUnicastDataBuffer() {
+        flushBroadcastDataBuffer();
+
+        if (unicastDataBuffer != null
+                && !unicastDataBuffer.isFinished()
+                && !unicastDataBuffer.isReleased()) {
+            return unicastDataBuffer;
+        }
+
+        unicastDataBuffer = createNewDataBuffer();
+        return unicastDataBuffer;
+    }
+
+    private SortBufferContainer getBroadcastDataBuffer() {
+        flushUnicastDataBuffer();
+
+        if (broadcastDataBuffer != null
+                && !broadcastDataBuffer.isFinished()
+                && !broadcastDataBuffer.isReleased()) {
+            return broadcastDataBuffer;
+        }
+
+        broadcastDataBuffer = createNewDataBuffer();
+        return broadcastDataBuffer;
+    }
+
+    private SortBufferContainer createNewDataBuffer() {
+        requestNetworkBuffers();
+
+        return new SortBufferContainer(
+                freeSegments,
+                this::recycleBuffer,
+                numSubpartitions,
+                bufferSizeBytes,
+                numBuffersForSort);
+    }
+
+    private void requestGuaranteedBuffers() {
+        int effectiveRequiredBuffers = effectiveNumRequestedBuffers();
+
+        while (freeSegments.size() < effectiveRequiredBuffers) {
+            BufferBuilder bufferBuilder = 
storeMemoryManager.requestBufferBlocking(this);
+            Buffer buffer = 
bufferBuilder.createBufferConsumerFromBeginning().build();
+            freeSegments.add(checkNotNull(buffer).getMemorySegment());
+            if (bufferRecycler == null) {
+                bufferRecycler = buffer.getRecycler();
+            }
+        }
+    }
+
+    private void requestNetworkBuffers() {
+        requestGuaranteedBuffers();
+
+        // Use the half of the buffers for writing, and the other half for 
reading
+        numBuffersForSort = freeSegments.size() / 2;
+    }
+
+    private void flushDataBuffer(SortBufferContainer sortBufferContainer) {
+        if (sortBufferContainer == null
+                || sortBufferContainer.isReleased()
+                || !sortBufferContainer.hasRemaining()) {
+            return;
+        }
+        sortBufferContainer.finish();
+
+        do {
+            MemorySegment freeSegment = getFreeSegment();
+            Pair<Integer, Buffer> bufferAndSubpartitionId =
+                    sortBufferContainer.readBuffer(freeSegment);
+            if (bufferAndSubpartitionId == null) {
+                if (freeSegment != null) {
+                    recycleBuffer(freeSegment);
+                }
+                break;
+            }
+            addFinishedBuffer(bufferAndSubpartitionId);
+        } while (true);
+
+        releaseFreeBuffers();
+        sortBufferContainer.release();
+    }
+
+    private void flushBroadcastDataBuffer() {
+        if (broadcastDataBuffer != null) {
+            flushDataBuffer(broadcastDataBuffer);
+            broadcastDataBuffer.release();
+            broadcastDataBuffer = null;
+        }
+    }
+
+    private void flushUnicastDataBuffer() {
+        if (unicastDataBuffer != null) {
+            flushDataBuffer(unicastDataBuffer);
+            unicastDataBuffer.release();
+            unicastDataBuffer = null;
+        }
+    }
+
+    private void flushContainerWhenEndOfPartition(
+            boolean isEndOfPartition, SortBufferContainer sortBufferContainer) {
+        if (isEndOfPartition) {
+            flushDataBuffer(sortBufferContainer);
+        }
+    }

Review Comment:
   This is unused.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
