xintongsong commented on code in PR #22833:
URL: https://github.com/apache/flink/pull/22833#discussion_r1239483891
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/BufferAccumulator.java:
##########
@@ -46,7 +46,10 @@ public interface BufferAccumulator extends AutoCloseable {
* transformed into finished buffers.
*/
void receive(
- ByteBuffer record, TieredStorageSubpartitionId subpartitionId,
Buffer.DataType dataType)
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
Review Comment:
When `isBroadcast` is true, what `subpartitionId` are we expecting? Will it
be ignored and thus can be an arbitrary id, or even `null`? Or is it required
to be some special value?
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import
org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+ /** The number of the subpartitions. */
+ private final int numSubpartitions;
+
+ /** The total number of the buffers used by the {@link
SortBufferAccumulator}. */
+ private final int numBuffers;
+
+ /** The byte size of one single buffer. */
+ private final int bufferSizeBytes;
+
+ /** The empty buffers without storing data. */
+ private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+ /** The memory manager of the tiered storage. */
+ private final TieredStorageMemoryManager storeMemoryManager;
+
+ /** The number of buffers for sorting used in the {@link
SortBufferContainer}. */
+ private int numBuffersForSort;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating broadcast data. Note
that this can be null
+ * before using it to store records, and this buffer container will be
released once flushed.
+ */
+ @Nullable private SortBufferContainer broadcastDataBuffer;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating non-broadcast data.
Note that this can be
+ * null before using it to store records, and this buffer container will
be released once
+ * flushed.
+ */
+ @Nullable private SortBufferContainer unicastDataBuffer;
+
+ /**
+ * The buffer recycler. Note that this can be null before requesting
buffers from the memory
+ * manager.
+ */
+ @Nullable private BufferRecycler bufferRecycler;
+
+ /**
+ * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not
prepared during
+ * construction, requiring the field to be initialized during setup.
Therefore, it is necessary
+ * to verify whether this field is null before using it.
+ */
+ @Nullable
+ private BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
accumulatedBufferFlusher;
+
+ public SortBufferAccumulator(
+ int numSubpartitions,
+ int numBuffers,
+ int bufferSizeBytes,
+ TieredStorageMemoryManager storeMemoryManager) {
+ this.numSubpartitions = numSubpartitions;
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffers = numBuffers;
+ this.storeMemoryManager = storeMemoryManager;
+ }
+
+ @Override
+ public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
bufferFlusher) {
+ this.accumulatedBufferFlusher = bufferFlusher;
+ }
+
+ @Override
+ public void receive(
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
+ throws IOException {
+ int targetSubpartition = subpartitionId.getSubpartitionId();
+ SortBufferContainer sortBufferContainer =
+ isBroadcast ? getBroadcastDataBuffer() :
getUnicastDataBuffer();
+ if (!sortBufferContainer.writeRecord(record, targetSubpartition,
dataType)) {
+ return;
+ }
+
+ if (!sortBufferContainer.hasRemaining()) {
+ sortBufferContainer.release();
+ writeLargeRecord(record, targetSubpartition, dataType);
+ return;
+ }
Review Comment:
It took me a while to figure out why `sortBufferContainer.hasRemaining()`
being `false` should result in writing a large record. It would be easier if we
add a simple comment explaining: the container is empty, yet we failed to write
the record into it, which suggests the record is larger than the container can
hold.
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferContainer.java:
##########
@@ -0,0 +1,454 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.core.memory.MemorySegmentFactory;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The buffer container for accumulating the records into {@link Buffer}s.
After accumulating, the
+ * {@link SortBufferAccumulator} will read the sorted buffers.
+ */
+public class SortBufferContainer {
+
+ /**
+ * Size of an index entry: 4 bytes for record length, 4 bytes for data
type and 8 bytes for
+ * pointer to next entry.
+ */
+ private static final int INDEX_ENTRY_SIZE = 4 + 4 + 8;
+
+ /** A list of {@link MemorySegment}s used to store data in memory. */
+ private final LinkedList<MemorySegment> freeSegments;
+
+ /** A segment list as a joint buffer which stores all records and index
entries. */
+ private final ArrayList<MemorySegment> dataSegments;
+
+ /** {@link BufferRecycler} used to recycle {@link #freeSegments}. */
+ private final BufferRecycler bufferRecycler;
+
+ /** Addresses of the first record's index entry for each subpartition. */
+ private final long[] subpartitionFirstBufferIndexEntries;
+
+ /** Addresses of the last record's index entry for each subpartition. */
+ private final long[] subpartitionLastBufferIndexEntries;
+
+ /** Size of buffers requested from buffer pool. All buffers must be of the
same size. */
+ private final int bufferSizeBytes;
+
+ /** Number of guaranteed buffers can be allocated from the buffer pool for
data sort. */
+ private final int numBuffersForSort;
+
+ // ------------------------------------------------------------------------
+ // The statistics and states
+ // ------------------------------------------------------------------------
+
+ /** Total number of bytes already appended to this sort buffer. */
+ private long numTotalBytes;
+
+ /** Total number of bytes already read from this sort buffer. */
+ private long numTotalBytesRead;
+
+ /** Whether this sort buffer is finished. One can only read a finished
sort buffer. */
+ private boolean isFinished;
+
+ /** Whether this sort buffer is released. A released sort buffer can not
be used. */
+ private boolean isReleased;
+
+ // ------------------------------------------------------------------------
+ // For writing
+ // ------------------------------------------------------------------------
+
+ /** Array index in the segment list of the current available buffer for
writing. */
+ private int writeBufferIndex;
+
+ /** Next position in the current available buffer for writing. */
+ private int writeOffsetInCurrentBuffer;
+
+ // ------------------------------------------------------------------------
+ // For reading
+ // ------------------------------------------------------------------------
+
+ /** Index entry address of the current record or event to be read. */
+ private long readBufferIndexEntry;
+
+ /**
+ * Record the bytes remaining after the last read, which must be
initialized before reading a
+ * new record.
+ */
+ private int recordRemainingBytesToRead;
+
+ /** The subpartition that is reading data from. */
+ private int readingSubpartitionId = -1;
+
+ SortBufferContainer(
+ LinkedList<MemorySegment> freeSegments,
+ BufferRecycler bufferRecycler,
+ int numSubpartitions,
+ int bufferSizeBytes,
+ int numBuffersForSort) {
+ checkArgument(bufferSizeBytes > INDEX_ENTRY_SIZE, "Buffer size is too
small.");
+ checkArgument(numBuffersForSort > 0, "No guaranteed buffers for
sort.");
+ checkState(numBuffersForSort <= freeSegments.size(), "Wrong number of
free segments.");
+
+ this.freeSegments = checkNotNull(freeSegments);
+ this.bufferRecycler = checkNotNull(bufferRecycler);
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffersForSort = numBuffersForSort;
+ this.dataSegments = new ArrayList<>();
+ this.subpartitionFirstBufferIndexEntries = new long[numSubpartitions];
+ this.subpartitionLastBufferIndexEntries = new long[numSubpartitions];
+
+ Arrays.fill(subpartitionFirstBufferIndexEntries, -1L);
+ Arrays.fill(subpartitionLastBufferIndexEntries, -1L);
+ }
+
+ // ------------------------------------------------------------------------
+ // Called by SortBufferAccumulator
+ // ------------------------------------------------------------------------
+
+ /**
+ * Note that no partial records will be written to this {@link
SortBufferContainer}, which means
+ * that either all data of target record will be written or nothing will
be written.
+ *
+ * @param record the record to be written
+ * @param subpartitionId the subpartition id
+ * @param dataType the data type of the record
+ * @return true if the {@link SortBufferContainer} is full, or return
false if the contianer is
+ * not full
Review Comment:
This is against conventions. Usually, a write-like method returning true means
the write operation has succeeded.
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import
org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+ /** The number of the subpartitions. */
+ private final int numSubpartitions;
+
+ /** The total number of the buffers used by the {@link
SortBufferAccumulator}. */
+ private final int numBuffers;
+
+ /** The byte size of one single buffer. */
+ private final int bufferSizeBytes;
+
+ /** The empty buffers without storing data. */
+ private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+ /** The memory manager of the tiered storage. */
+ private final TieredStorageMemoryManager storeMemoryManager;
+
+ /** The number of buffers for sorting used in the {@link
SortBufferContainer}. */
+ private int numBuffersForSort;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating broadcast data. Note
that this can be null
+ * before using it to store records, and this buffer container will be
released once flushed.
+ */
+ @Nullable private SortBufferContainer broadcastDataBuffer;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating non-broadcast data.
Note that this can be
+ * null before using it to store records, and this buffer container will
be released once
+ * flushed.
+ */
+ @Nullable private SortBufferContainer unicastDataBuffer;
+
+ /**
+ * The buffer recycler. Note that this can be null before requesting
buffers from the memory
+ * manager.
+ */
+ @Nullable private BufferRecycler bufferRecycler;
+
+ /**
+ * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not
prepared during
+ * construction, requiring the field to be initialized during setup.
Therefore, it is necessary
+ * to verify whether this field is null before using it.
+ */
+ @Nullable
+ private BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
accumulatedBufferFlusher;
+
+ public SortBufferAccumulator(
+ int numSubpartitions,
+ int numBuffers,
+ int bufferSizeBytes,
+ TieredStorageMemoryManager storeMemoryManager) {
+ this.numSubpartitions = numSubpartitions;
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffers = numBuffers;
+ this.storeMemoryManager = storeMemoryManager;
+ }
+
+ @Override
+ public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
bufferFlusher) {
+ this.accumulatedBufferFlusher = bufferFlusher;
+ }
+
+ @Override
+ public void receive(
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
+ throws IOException {
+ int targetSubpartition = subpartitionId.getSubpartitionId();
+ SortBufferContainer sortBufferContainer =
+ isBroadcast ? getBroadcastDataBuffer() :
getUnicastDataBuffer();
+ if (!sortBufferContainer.writeRecord(record, targetSubpartition,
dataType)) {
+ return;
+ }
+
+ if (!sortBufferContainer.hasRemaining()) {
+ sortBufferContainer.release();
+ writeLargeRecord(record, targetSubpartition, dataType);
+ return;
+ }
+
+ flushDataBuffer(sortBufferContainer);
+ sortBufferContainer.release();
+ if (record.hasRemaining()) {
+ receive(record, subpartitionId, dataType, isBroadcast);
+ }
+ }
+
+ @Override
+ public void close() {
+ flushUnicastDataBuffer();
+ flushBroadcastDataBuffer();
+ releaseFreeBuffers();
+ releaseDataBuffer(unicastDataBuffer);
+ releaseDataBuffer(broadcastDataBuffer);
+ }
+
+ // ------------------------------------------------------------------------
+ // Internal Methods
+ // ------------------------------------------------------------------------
+
+ private SortBufferContainer getUnicastDataBuffer() {
+ flushBroadcastDataBuffer();
+
+ if (unicastDataBuffer != null
+ && !unicastDataBuffer.isFinished()
+ && !unicastDataBuffer.isReleased()) {
+ return unicastDataBuffer;
+ }
+
+ unicastDataBuffer = createNewDataBuffer();
+ return unicastDataBuffer;
+ }
+
+ private SortBufferContainer getBroadcastDataBuffer() {
+ flushUnicastDataBuffer();
+
+ if (broadcastDataBuffer != null
+ && !broadcastDataBuffer.isFinished()
+ && !broadcastDataBuffer.isReleased()) {
+ return broadcastDataBuffer;
+ }
+
+ broadcastDataBuffer = createNewDataBuffer();
+ return broadcastDataBuffer;
+ }
+
+ private SortBufferContainer createNewDataBuffer() {
+ requestNetworkBuffers();
+
+ return new SortBufferContainer(
+ freeSegments,
+ this::recycleBuffer,
+ numSubpartitions,
+ bufferSizeBytes,
+ numBuffersForSort);
+ }
+
+ private void requestGuaranteedBuffers() {
+ int effectiveRequiredBuffers = effectiveNumRequestedBuffers();
+
+ while (freeSegments.size() < effectiveRequiredBuffers) {
+ BufferBuilder bufferBuilder =
storeMemoryManager.requestBufferBlocking(this);
+ Buffer buffer =
bufferBuilder.createBufferConsumerFromBeginning().build();
+ freeSegments.add(checkNotNull(buffer).getMemorySegment());
+ if (bufferRecycler == null) {
+ bufferRecycler = buffer.getRecycler();
+ }
+ }
+ }
+
+ private void requestNetworkBuffers() {
+ requestGuaranteedBuffers();
+
+ // Use the half of the buffers for writing, and the other half for
reading
+ numBuffersForSort = freeSegments.size() / 2;
+ }
+
+ private void flushDataBuffer(SortBufferContainer sortBufferContainer) {
+ if (sortBufferContainer == null
+ || sortBufferContainer.isReleased()
+ || !sortBufferContainer.hasRemaining()) {
+ return;
+ }
+ sortBufferContainer.finish();
+
+ do {
+ MemorySegment freeSegment = getFreeSegment();
+ Pair<Integer, Buffer> bufferAndSubpartitionId =
+ sortBufferContainer.readBuffer(freeSegment);
+ if (bufferAndSubpartitionId == null) {
+ if (freeSegment != null) {
+ recycleBuffer(freeSegment);
+ }
+ break;
+ }
+ addFinishedBuffer(bufferAndSubpartitionId);
+ } while (true);
+
+ releaseFreeBuffers();
+ sortBufferContainer.release();
+ }
+
+ private void flushBroadcastDataBuffer() {
+ if (broadcastDataBuffer != null) {
+ flushDataBuffer(broadcastDataBuffer);
+ broadcastDataBuffer.release();
+ broadcastDataBuffer = null;
+ }
+ }
+
+ private void flushUnicastDataBuffer() {
+ if (unicastDataBuffer != null) {
+ flushDataBuffer(unicastDataBuffer);
+ unicastDataBuffer.release();
+ unicastDataBuffer = null;
+ }
+ }
+
+ private void flushContainerWhenEndOfPartition(
+ boolean isEndOfPartition, SortBufferContainer sortBufferContainer)
{
+ if (isEndOfPartition) {
+ flushDataBuffer(sortBufferContainer);
+ }
+ }
+
+ private void writeLargeRecord(
+ ByteBuffer record, int targetSubpartition, Buffer.DataType
dataType) {
+
+ checkState(dataType != Buffer.DataType.EVENT_BUFFER);
+ while (record.hasRemaining()) {
+ int toCopy = Math.min(record.remaining(), bufferSizeBytes);
+ MemorySegment writeBuffer = checkNotNull(getFreeSegment());
+ writeBuffer.put(0, record, toCopy);
+
+ addFinishedBuffer(
+ Pair.of(
+ targetSubpartition,
+ new NetworkBuffer(
+ writeBuffer, checkNotNull(bufferRecycler),
dataType, toCopy)));
+ }
+
+ releaseFreeBuffers();
+ }
+
+ private MemorySegment getFreeSegment() {
+ MemorySegment freeSegment = freeSegments.poll();
+ if (freeSegment == null) {
+ BufferBuilder bufferBuilder =
storeMemoryManager.requestBufferBlocking(this);
+ Buffer buffer =
bufferBuilder.createBufferConsumerFromBeginning().build();
+ freeSegment = buffer.getMemorySegment();
+ }
+ return freeSegment;
+ }
+
+ private int effectiveNumRequestedBuffers() {
+ return Math.min(numSubpartitions + 1, numBuffers);
Review Comment:
Why `numSubpartitions + 1`?
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferContainer.java:
##########
@@ -0,0 +1,454 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.core.memory.MemorySegmentFactory;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+
+import static org.apache.flink.util.Preconditions.checkArgument;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The buffer container for accumulating the records into {@link Buffer}s.
After accumulating, the
+ * {@link SortBufferAccumulator} will read the sorted buffers.
+ */
+public class SortBufferContainer {
Review Comment:
1. The name is confusing. I'd suggest simply `SortBuffer` or `BufferSorter`.
2. I think the important characteristic of this component is that, you can
write data into it in any order but read data from it in the order grouped by
subpartitions. This is not clear enough from the JavaDoc.
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import
org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+ /** The number of the subpartitions. */
+ private final int numSubpartitions;
+
+ /** The total number of the buffers used by the {@link
SortBufferAccumulator}. */
+ private final int numBuffers;
+
+ /** The byte size of one single buffer. */
+ private final int bufferSizeBytes;
+
+ /** The empty buffers without storing data. */
+ private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+ /** The memory manager of the tiered storage. */
+ private final TieredStorageMemoryManager storeMemoryManager;
+
+ /** The number of buffers for sorting used in the {@link
SortBufferContainer}. */
+ private int numBuffersForSort;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating broadcast data. Note
that this can be null
+ * before using it to store records, and this buffer container will be
released once flushed.
+ */
+ @Nullable private SortBufferContainer broadcastDataBuffer;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating non-broadcast data.
Note that this can be
+ * null before using it to store records, and this buffer container will
be released once
+ * flushed.
+ */
+ @Nullable private SortBufferContainer unicastDataBuffer;
+
+ /**
+ * The buffer recycler. Note that this can be null before requesting
buffers from the memory
+ * manager.
+ */
+ @Nullable private BufferRecycler bufferRecycler;
+
+ /**
+ * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not
prepared during
+ * construction, requiring the field to be initialized during setup.
Therefore, it is necessary
+ * to verify whether this field is null before using it.
+ */
+ @Nullable
+ private BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
accumulatedBufferFlusher;
+
+ public SortBufferAccumulator(
+ int numSubpartitions,
+ int numBuffers,
+ int bufferSizeBytes,
+ TieredStorageMemoryManager storeMemoryManager) {
+ this.numSubpartitions = numSubpartitions;
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffers = numBuffers;
+ this.storeMemoryManager = storeMemoryManager;
+ }
+
+ @Override
+ public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
bufferFlusher) {
+ this.accumulatedBufferFlusher = bufferFlusher;
+ }
+
+ @Override
+ public void receive(
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
+ throws IOException {
+ int targetSubpartition = subpartitionId.getSubpartitionId();
+ SortBufferContainer sortBufferContainer =
+ isBroadcast ? getBroadcastDataBuffer() :
getUnicastDataBuffer();
+ if (!sortBufferContainer.writeRecord(record, targetSubpartition,
dataType)) {
+ return;
+ }
+
+ if (!sortBufferContainer.hasRemaining()) {
+ sortBufferContainer.release();
+ writeLargeRecord(record, targetSubpartition, dataType);
+ return;
+ }
+
+ flushDataBuffer(sortBufferContainer);
+ sortBufferContainer.release();
+ if (record.hasRemaining()) {
Review Comment:
Shouldn't this always be `true`?
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import
org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+ /** The number of the subpartitions. */
+ private final int numSubpartitions;
+
+ /** The total number of the buffers used by the {@link
SortBufferAccumulator}. */
+ private final int numBuffers;
+
+ /** The byte size of one single buffer. */
+ private final int bufferSizeBytes;
+
+ /** The empty buffers without storing data. */
+ private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+ /** The memory manager of the tiered storage. */
+ private final TieredStorageMemoryManager storeMemoryManager;
+
+ /** The number of buffers for sorting used in the {@link
SortBufferContainer}. */
+ private int numBuffersForSort;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating broadcast data. Note
that this can be null
+ * before using it to store records, and this buffer container will be
released once flushed.
+ */
+ @Nullable private SortBufferContainer broadcastDataBuffer;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating non-broadcast data.
Note that this can be
+ * null before using it to store records, and this buffer container will
be released once
+ * flushed.
+ */
+ @Nullable private SortBufferContainer unicastDataBuffer;
+
+ /**
+ * The buffer recycler. Note that this can be null before requesting
buffers from the memory
+ * manager.
+ */
+ @Nullable private BufferRecycler bufferRecycler;
+
+ /**
+ * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not
prepared during
+ * construction, requiring the field to be initialized during setup.
Therefore, it is necessary
+ * to verify whether this field is null before using it.
+ */
+ @Nullable
+ private BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
accumulatedBufferFlusher;
+
+ public SortBufferAccumulator(
+ int numSubpartitions,
+ int numBuffers,
+ int bufferSizeBytes,
+ TieredStorageMemoryManager storeMemoryManager) {
+ this.numSubpartitions = numSubpartitions;
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffers = numBuffers;
+ this.storeMemoryManager = storeMemoryManager;
+ }
+
+ @Override
+ public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
bufferFlusher) {
+ this.accumulatedBufferFlusher = bufferFlusher;
+ }
+
+ @Override
+ public void receive(
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
+ throws IOException {
+ int targetSubpartition = subpartitionId.getSubpartitionId();
+ SortBufferContainer sortBufferContainer =
+ isBroadcast ? getBroadcastDataBuffer() :
getUnicastDataBuffer();
Review Comment:
It is implicit that getting one of the two data buffers would result in the
other one being flushed.
I'd suggest the following:
- Have only one field `currentDataBuffer`, instead of the two nullable fields
`broadcastDataBuffer` and `unicastDataBuffer`, of which only one can hold a
non-null value at any time.
- Instead of calling `getBroadcast/UnicastDataBuffer`, we can have a method
`switchCurrentDataBufferIfNeeded(isBroadcast)`, in which we can update the
value of `currentDataBuffer`. This would make the flushing less surprising.
- We might need another field to remember whether the current data buffer is
broadcast or unicast.
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import
org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+ /** The number of the subpartitions. */
+ private final int numSubpartitions;
+
+ /** The total number of the buffers used by the {@link
SortBufferAccumulator}. */
+ private final int numBuffers;
+
+ /** The byte size of one single buffer. */
+ private final int bufferSizeBytes;
+
+ /** The empty buffers without storing data. */
+ private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+ /** The memory manager of the tiered storage. */
+ private final TieredStorageMemoryManager storeMemoryManager;
+
+ /** The number of buffers for sorting used in the {@link
SortBufferContainer}. */
+ private int numBuffersForSort;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating broadcast data. Note
that this can be null
+ * before using it to store records, and this buffer container will be
released once flushed.
+ */
+ @Nullable private SortBufferContainer broadcastDataBuffer;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating non-broadcast data.
Note that this can be
+ * null before using it to store records, and this buffer container will
be released once
+ * flushed.
+ */
+ @Nullable private SortBufferContainer unicastDataBuffer;
+
+ /**
+ * The buffer recycler. Note that this can be null before requesting
buffers from the memory
+ * manager.
+ */
+ @Nullable private BufferRecycler bufferRecycler;
+
+ /**
+ * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not
prepared during
+ * construction, requiring the field to be initialized during setup.
Therefore, it is necessary
+ * to verify whether this field is null before using it.
+ */
+ @Nullable
+ private BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
accumulatedBufferFlusher;
+
+ public SortBufferAccumulator(
+ int numSubpartitions,
+ int numBuffers,
+ int bufferSizeBytes,
+ TieredStorageMemoryManager storeMemoryManager) {
+ this.numSubpartitions = numSubpartitions;
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffers = numBuffers;
+ this.storeMemoryManager = storeMemoryManager;
+ }
+
+ @Override
+ public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
bufferFlusher) {
+ this.accumulatedBufferFlusher = bufferFlusher;
+ }
+
+ @Override
+ public void receive(
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
+ throws IOException {
+ int targetSubpartition = subpartitionId.getSubpartitionId();
+ SortBufferContainer sortBufferContainer =
+ isBroadcast ? getBroadcastDataBuffer() :
getUnicastDataBuffer();
+ if (!sortBufferContainer.writeRecord(record, targetSubpartition,
dataType)) {
+ return;
+ }
+
+ if (!sortBufferContainer.hasRemaining()) {
+ sortBufferContainer.release();
+ writeLargeRecord(record, targetSubpartition, dataType);
+ return;
+ }
+
+ flushDataBuffer(sortBufferContainer);
+ sortBufferContainer.release();
+ if (record.hasRemaining()) {
+ receive(record, subpartitionId, dataType, isBroadcast);
+ }
+ }
+
+ @Override
+ public void close() {
+ flushUnicastDataBuffer();
+ flushBroadcastDataBuffer();
+ releaseFreeBuffers();
+ releaseDataBuffer(unicastDataBuffer);
+ releaseDataBuffer(broadcastDataBuffer);
+ }
+
+ // ------------------------------------------------------------------------
+ // Internal Methods
+ // ------------------------------------------------------------------------
+
+ private SortBufferContainer getUnicastDataBuffer() {
+ flushBroadcastDataBuffer();
+
+ if (unicastDataBuffer != null
+ && !unicastDataBuffer.isFinished()
+ && !unicastDataBuffer.isReleased()) {
+ return unicastDataBuffer;
+ }
+
+ unicastDataBuffer = createNewDataBuffer();
+ return unicastDataBuffer;
+ }
+
+ private SortBufferContainer getBroadcastDataBuffer() {
+ flushUnicastDataBuffer();
+
+ if (broadcastDataBuffer != null
+ && !broadcastDataBuffer.isFinished()
+ && !broadcastDataBuffer.isReleased()) {
+ return broadcastDataBuffer;
+ }
+
+ broadcastDataBuffer = createNewDataBuffer();
+ return broadcastDataBuffer;
+ }
+
+ private SortBufferContainer createNewDataBuffer() {
+ requestNetworkBuffers();
+
+ return new SortBufferContainer(
+ freeSegments,
+ this::recycleBuffer,
+ numSubpartitions,
+ bufferSizeBytes,
+ numBuffersForSort);
+ }
+
+ private void requestGuaranteedBuffers() {
+ int effectiveRequiredBuffers = effectiveNumRequestedBuffers();
+
+ while (freeSegments.size() < effectiveRequiredBuffers) {
+ BufferBuilder bufferBuilder =
storeMemoryManager.requestBufferBlocking(this);
+ Buffer buffer =
bufferBuilder.createBufferConsumerFromBeginning().build();
+ freeSegments.add(checkNotNull(buffer).getMemorySegment());
+ if (bufferRecycler == null) {
+ bufferRecycler = buffer.getRecycler();
+ }
+ }
+ }
+
+ private void requestNetworkBuffers() {
+ requestGuaranteedBuffers();
+
+ // Use the half of the buffers for writing, and the other half for
reading
+ numBuffersForSort = freeSegments.size() / 2;
+ }
+
+ private void flushDataBuffer(SortBufferContainer sortBufferContainer) {
+ if (sortBufferContainer == null
+ || sortBufferContainer.isReleased()
+ || !sortBufferContainer.hasRemaining()) {
+ return;
+ }
+ sortBufferContainer.finish();
+
+ do {
+ MemorySegment freeSegment = getFreeSegment();
+ Pair<Integer, Buffer> bufferAndSubpartitionId =
+ sortBufferContainer.readBuffer(freeSegment);
+ if (bufferAndSubpartitionId == null) {
+ if (freeSegment != null) {
+ recycleBuffer(freeSegment);
+ }
+ break;
+ }
+ addFinishedBuffer(bufferAndSubpartitionId);
+ } while (true);
+
+ releaseFreeBuffers();
+ sortBufferContainer.release();
+ }
+
+ private void flushBroadcastDataBuffer() {
+ if (broadcastDataBuffer != null) {
+ flushDataBuffer(broadcastDataBuffer);
+ broadcastDataBuffer.release();
+ broadcastDataBuffer = null;
+ }
+ }
+
+ private void flushUnicastDataBuffer() {
+ if (unicastDataBuffer != null) {
+ flushDataBuffer(unicastDataBuffer);
+ unicastDataBuffer.release();
+ unicastDataBuffer = null;
+ }
+ }
+
+ private void flushContainerWhenEndOfPartition(
+ boolean isEndOfPartition, SortBufferContainer sortBufferContainer)
{
+ if (isEndOfPartition) {
+ flushDataBuffer(sortBufferContainer);
+ }
+ }
+
+ private void writeLargeRecord(
+ ByteBuffer record, int targetSubpartition, Buffer.DataType
dataType) {
+
+ checkState(dataType != Buffer.DataType.EVENT_BUFFER);
+ while (record.hasRemaining()) {
+ int toCopy = Math.min(record.remaining(), bufferSizeBytes);
+ MemorySegment writeBuffer = checkNotNull(getFreeSegment());
+ writeBuffer.put(0, record, toCopy);
+
+ addFinishedBuffer(
+ Pair.of(
+ targetSubpartition,
+ new NetworkBuffer(
+ writeBuffer, checkNotNull(bufferRecycler),
dataType, toCopy)));
+ }
+
+ releaseFreeBuffers();
+ }
+
+ private MemorySegment getFreeSegment() {
+ MemorySegment freeSegment = freeSegments.poll();
+ if (freeSegment == null) {
+ BufferBuilder bufferBuilder =
storeMemoryManager.requestBufferBlocking(this);
+ Buffer buffer =
bufferBuilder.createBufferConsumerFromBeginning().build();
+ freeSegment = buffer.getMemorySegment();
+ }
+ return freeSegment;
+ }
+
+ private int effectiveNumRequestedBuffers() {
+ return Math.min(numSubpartitions + 1, numBuffers);
+ }
+
+ private void releaseDataBuffer(SortBufferContainer sortBufferContainer) {
+ if (sortBufferContainer != null) {
+ sortBufferContainer.release();
+ }
+ }
+
+ private void addFinishedBuffer(Pair<Integer, Buffer>
bufferAndSubpartitionId) {
Review Comment:
```suggestion
private void flushBuffer(Pair<Integer, Buffer> bufferAndSubpartitionId) {
```
##########
flink-runtime/src/main/java/org/apache/flink/runtime/io/network/partition/hybrid/tiered/storage/SortBufferAccumulator.java:
##########
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.io.network.partition.hybrid.tiered.storage;
+
+import org.apache.flink.core.memory.MemorySegment;
+import org.apache.flink.runtime.io.network.buffer.Buffer;
+import org.apache.flink.runtime.io.network.buffer.BufferBuilder;
+import org.apache.flink.runtime.io.network.buffer.BufferRecycler;
+import org.apache.flink.runtime.io.network.buffer.NetworkBuffer;
+import
org.apache.flink.runtime.io.network.partition.hybrid.tiered.common.TieredStorageSubpartitionId;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.function.BiConsumer;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+import static org.apache.flink.util.Preconditions.checkState;
+
+/**
+ * The sort-based implementation of the {@link BufferAccumulator}. The {@link
BufferAccumulator}
+ * receives the records from {@link TieredStorageProducerClient} and the
records will accumulate and
+ * transform to finished buffers. The accumulated buffers will be transferred
to the corresponding
+ * tier dynamically.
+ *
+ * <p>The {@link BufferAccumulator} can help use less buffers to accumulate
data, which decouples
+ * the buffer usage with the number of parallelism. The number of buffers used
by the {@link
+ * SortBufferAccumulator} will be numBuffers at most. Once the {@link
SortBufferContainer} is full,
+ * or receiving a different type of buffer, or receiving the end-of-partition
event, the buffer in
+ * the sort buffer container will be flushed to the tiers.
+ *
+ * <p>Note that this class need not be thread-safe, because it should only be
accessed from the main
+ * thread.
+ */
+public class SortBufferAccumulator implements BufferAccumulator {
+
+ /** The number of the subpartitions. */
+ private final int numSubpartitions;
+
+ /** The total number of the buffers used by the {@link
SortBufferAccumulator}. */
+ private final int numBuffers;
+
+ /** The byte size of one single buffer. */
+ private final int bufferSizeBytes;
+
+ /** The empty buffers without storing data. */
+ private final LinkedList<MemorySegment> freeSegments = new LinkedList<>();
+
+ /** The memory manager of the tiered storage. */
+ private final TieredStorageMemoryManager storeMemoryManager;
+
+ /** The number of buffers for sorting used in the {@link
SortBufferContainer}. */
+ private int numBuffersForSort;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating broadcast data. Note
that this can be null
+ * before using it to store records, and this buffer container will be
released once flushed.
+ */
+ @Nullable private SortBufferContainer broadcastDataBuffer;
+
+ /**
+ * The {@link SortBufferContainer} for accumulating non-broadcast data.
Note that this can be
+ * null before using it to store records, and this buffer container will
be released once
+ * flushed.
+ */
+ @Nullable private SortBufferContainer unicastDataBuffer;
+
+ /**
+ * The buffer recycler. Note that this can be null before requesting
buffers from the memory
+ * manager.
+ */
+ @Nullable private BufferRecycler bufferRecycler;
+
+ /**
+ * The {@link SortBufferAccumulator}'s accumulated buffer flusher is not
prepared during
+ * construction, requiring the field to be initialized during setup.
Therefore, it is necessary
+ * to verify whether this field is null before using it.
+ */
+ @Nullable
+ private BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
accumulatedBufferFlusher;
+
+ public SortBufferAccumulator(
+ int numSubpartitions,
+ int numBuffers,
+ int bufferSizeBytes,
+ TieredStorageMemoryManager storeMemoryManager) {
+ this.numSubpartitions = numSubpartitions;
+ this.bufferSizeBytes = bufferSizeBytes;
+ this.numBuffers = numBuffers;
+ this.storeMemoryManager = storeMemoryManager;
+ }
+
+ @Override
+ public void setup(BiConsumer<TieredStorageSubpartitionId, List<Buffer>>
bufferFlusher) {
+ this.accumulatedBufferFlusher = bufferFlusher;
+ }
+
+ @Override
+ public void receive(
+ ByteBuffer record,
+ TieredStorageSubpartitionId subpartitionId,
+ Buffer.DataType dataType,
+ boolean isBroadcast)
+ throws IOException {
+ int targetSubpartition = subpartitionId.getSubpartitionId();
+ SortBufferContainer sortBufferContainer =
+ isBroadcast ? getBroadcastDataBuffer() :
getUnicastDataBuffer();
+ if (!sortBufferContainer.writeRecord(record, targetSubpartition,
dataType)) {
+ return;
+ }
+
+ if (!sortBufferContainer.hasRemaining()) {
+ sortBufferContainer.release();
+ writeLargeRecord(record, targetSubpartition, dataType);
+ return;
+ }
+
+ flushDataBuffer(sortBufferContainer);
+ sortBufferContainer.release();
+ if (record.hasRemaining()) {
+ receive(record, subpartitionId, dataType, isBroadcast);
+ }
+ }
+
+ @Override
+ public void close() {
+ flushUnicastDataBuffer();
+ flushBroadcastDataBuffer();
+ releaseFreeBuffers();
+ releaseDataBuffer(unicastDataBuffer);
+ releaseDataBuffer(broadcastDataBuffer);
+ }
+
+ // ------------------------------------------------------------------------
+ // Internal Methods
+ // ------------------------------------------------------------------------
+
+ private SortBufferContainer getUnicastDataBuffer() {
+ flushBroadcastDataBuffer();
+
+ if (unicastDataBuffer != null
+ && !unicastDataBuffer.isFinished()
+ && !unicastDataBuffer.isReleased()) {
+ return unicastDataBuffer;
+ }
+
+ unicastDataBuffer = createNewDataBuffer();
+ return unicastDataBuffer;
+ }
+
+ private SortBufferContainer getBroadcastDataBuffer() {
+ flushUnicastDataBuffer();
+
+ if (broadcastDataBuffer != null
+ && !broadcastDataBuffer.isFinished()
+ && !broadcastDataBuffer.isReleased()) {
+ return broadcastDataBuffer;
+ }
+
+ broadcastDataBuffer = createNewDataBuffer();
+ return broadcastDataBuffer;
+ }
+
+ private SortBufferContainer createNewDataBuffer() {
+ requestNetworkBuffers();
+
+ return new SortBufferContainer(
+ freeSegments,
+ this::recycleBuffer,
+ numSubpartitions,
+ bufferSizeBytes,
+ numBuffersForSort);
+ }
+
+ private void requestGuaranteedBuffers() {
+ int effectiveRequiredBuffers = effectiveNumRequestedBuffers();
+
+ while (freeSegments.size() < effectiveRequiredBuffers) {
+ BufferBuilder bufferBuilder =
storeMemoryManager.requestBufferBlocking(this);
+ Buffer buffer =
bufferBuilder.createBufferConsumerFromBeginning().build();
+ freeSegments.add(checkNotNull(buffer).getMemorySegment());
+ if (bufferRecycler == null) {
+ bufferRecycler = buffer.getRecycler();
+ }
+ }
+ }
+
+ private void requestNetworkBuffers() {
+ requestGuaranteedBuffers();
+
+ // Use the half of the buffers for writing, and the other half for
reading
+ numBuffersForSort = freeSegments.size() / 2;
+ }
+
+ private void flushDataBuffer(SortBufferContainer sortBufferContainer) {
+ if (sortBufferContainer == null
+ || sortBufferContainer.isReleased()
+ || !sortBufferContainer.hasRemaining()) {
+ return;
+ }
+ sortBufferContainer.finish();
+
+ do {
+ MemorySegment freeSegment = getFreeSegment();
+ Pair<Integer, Buffer> bufferAndSubpartitionId =
+ sortBufferContainer.readBuffer(freeSegment);
+ if (bufferAndSubpartitionId == null) {
+ if (freeSegment != null) {
+ recycleBuffer(freeSegment);
+ }
+ break;
+ }
+ addFinishedBuffer(bufferAndSubpartitionId);
+ } while (true);
+
+ releaseFreeBuffers();
+ sortBufferContainer.release();
+ }
+
+ private void flushBroadcastDataBuffer() {
+ if (broadcastDataBuffer != null) {
+ flushDataBuffer(broadcastDataBuffer);
+ broadcastDataBuffer.release();
+ broadcastDataBuffer = null;
+ }
+ }
+
+ private void flushUnicastDataBuffer() {
+ if (unicastDataBuffer != null) {
+ flushDataBuffer(unicastDataBuffer);
+ unicastDataBuffer.release();
+ unicastDataBuffer = null;
+ }
+ }
+
+ private void flushContainerWhenEndOfPartition(
+ boolean isEndOfPartition, SortBufferContainer sortBufferContainer)
{
+ if (isEndOfPartition) {
+ flushDataBuffer(sortBufferContainer);
+ }
+ }
Review Comment:
This is unused.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]