Re: [PR] MSQ worker: Support in-memory shuffles. (druid)

via GitHub Wed, 24 Jul 2024 12:04:52 -0700


github-advanced-security[bot] commented on code in PR #16790:
URL: https://github.com/apache/druid/pull/16790#discussion_r1690295452



##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/shuffle/output/ChannelStageOutputReader.java:
##########
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.msq.shuffle.output;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.primitives.Ints;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import it.unimi.dsi.fastutil.bytes.ByteArrays;
+import org.apache.druid.common.guava.FutureUtils;
+import org.apache.druid.error.DruidException;
+import org.apache.druid.frame.channel.ByteTracker;
+import org.apache.druid.frame.channel.ReadableFrameChannel;
+import org.apache.druid.frame.file.FrameFileWriter;
+import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.msq.exec.OutputChannelMode;
+import org.apache.druid.msq.kernel.controller.ControllerQueryKernelUtils;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.WritableByteChannel;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+/**
+ * Reader for {@link ReadableFrameChannel}.
+ *
+ * Because this reader returns an underlying channel directly, it must only be 
used when it is certain that
+ * only a single consumer exists, i.e., when using output mode {@link 
OutputChannelMode#MEMORY}. See
+ * {@link ControllerQueryKernelUtils#canUseMemoryOutput} for the code that 
ensures that there is only a single
+ * consumer in the in-memory case.
+ */
+public class ChannelStageOutputReader implements StageOutputReader
+{
+  enum State
+  {
+    INIT,
+    LOCAL,
+    REMOTE,
+    CLOSED
+  }
+
+  private final ReadableFrameChannel channel;
+  private final FrameFileWriter writer;
+
+  /**
+   * Queue of buffered chunks, each held as a byte array.
+   */
+  private final Deque<byte[]> chunks = new ArrayDeque<>();
+
+  /**
+   * State of this reader.
+   */
+  private State state = State.INIT;
+
+  /**
+   * Position within the overall stream.
+   */
+  private long cursor;
+
+  /**
+   * Offset within the first chunk in {@link #chunks} which corresponds to {@link #cursor}.
+   */
+  private int positionWithinFirstChunk;
+
+  /**
+   * Whether {@link FrameFileWriter#close()} has been called on {@link #writer}.
+   */
+  private boolean didCloseWriter;
+
+  public ChannelStageOutputReader(final ReadableFrameChannel channel)
+  {
+    this.channel = channel;
+    this.writer = FrameFileWriter.open(new ChunkAcceptor(), null, 
ByteTracker.unboundedTracker());
+  }
+
+  @Override
+  public synchronized ListenableFuture<InputStream> readRemotelyFrom(final 
long offset)
+  {
+    if (state == State.INIT) {
+      state = State.REMOTE;
+    } else if (state == State.LOCAL) {
+      throw new ISE("Cannot read both remotely and locally");
+    } else if (state == State.CLOSED) {
+      throw new ISE("Closed");
+    }
+
+    if (offset < cursor) {
+      return Futures.immediateFailedFuture(
+          new ISE("Offset[%,d] no longer available, current cursor is[%,d]", 
offset, cursor));
+    }
+
+    while (chunks.isEmpty() || offset > cursor) {
+      // Fetch additional chunks if needed.
+      if (chunks.isEmpty()) {
+        if (didCloseWriter) {
+          if (offset == cursor) {
+            return Futures.immediateFuture(new 
ByteArrayInputStream(ByteArrays.EMPTY_ARRAY));
+          } else {
+            throw DruidException.defensive(
+                "Channel finished but cursor[%,d] does not match requested 
offset[%,d]",
+                cursor,
+                offset
+            );
+          }
+        } else if (channel.isFinished()) {
+          try {
+            writer.close();
+          }
+          catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+
+          didCloseWriter = true;
+          continue;
+        } else if (channel.canRead()) {
+          try {
+            writer.writeFrame(channel.read(), FrameFileWriter.NO_PARTITION);
+          }
+          catch (Exception e) {
+            try {
+              writer.abort();
+            }
+            catch (IOException e2) {
+              e.addSuppressed(e2);
+            }
+
+            throw new RuntimeException(e);
+          }
+        } else {
+          return FutureUtils.transformAsync(channel.readabilityFuture(), 
ignored -> readRemotelyFrom(offset));

Review Comment:
   ## Useless parameter
   
   The parameter 'ignored' is never used.
   
   [Show more 
details](https://github.com/apache/druid/security/code-scanning/7591)



##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/rpc/ResourcePermissionMapper.java:
##########
@@ -23,11 +23,9 @@
 
 import java.util.List;
 
-/**
- * Provides HTTP resources such as {@link ControllerResource} with information 
about which permissions are needed
- * for requests.
- */
 public interface ResourcePermissionMapper
 {
   List<ResourceAction> getAdminPermissions();
+
+  List<ResourceAction> getQueryPermissions(String queryId);

Review Comment:
   ## Useless parameter
   
   The parameter 'queryId' is never used.
   
   [Show more 
details](https://github.com/apache/druid/security/code-scanning/7592)



##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/shuffle/output/NilStageOutputReader.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.msq.shuffle.output;
+
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.apache.druid.frame.channel.ByteTracker;
+import org.apache.druid.frame.channel.ReadableFrameChannel;
+import org.apache.druid.frame.channel.ReadableNilFrameChannel;
+import org.apache.druid.frame.file.FrameFileWriter;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.channels.Channels;
+
+/**
+ * Reader for empty channel.
+ */
+public class NilStageOutputReader implements StageOutputReader
+{
+  public static final NilStageOutputReader INSTANCE = new 
NilStageOutputReader();
+
+  private static final byte[] EMPTY_FRAME_FILE;
+
+  static {
+    try {
+      final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      FrameFileWriter.open(Channels.newChannel(baos), null, 
ByteTracker.unboundedTracker()).close();
+      EMPTY_FRAME_FILE = baos.toByteArray();
+    }
+    catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public ListenableFuture<InputStream> readRemotelyFrom(final long offset)
+  {
+    final ByteArrayInputStream in = new ByteArrayInputStream(EMPTY_FRAME_FILE);
+
+    //noinspection ResultOfMethodCallIgnored: OK to ignore since "skip" always 
works for ByteArrayInputStream.
+    in.skip(offset);

Review Comment:
   ## Ignored error status of call
   
   Method readRemotelyFrom ignores exceptional return value of 
ByteArrayInputStream.skip.
   
   [Show more 
details](https://github.com/apache/druid/security/code-scanning/7595)



##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/rpc/ControllerResource.java:
##########
@@ -82,6 +82,28 @@
     return Response.status(Response.Status.ACCEPTED).build();
   }
 
+  /**
+   * Used by subtasks to inform the controller that they are done reading 
their input, in cases where they would
+   * not be calling {@link #httpPostPartialKeyStatistics(Object, String, int, 
int, HttpServletRequest)}.
+   *
+   * See {@link ControllerClient#postDoneReadingInput(StageId, int)} for the 
client-side code that calls this API.
+   */
+  @POST
+  @Path("/doneReadingInput/{queryId}/{stageNumber}/{workerNumber}")
+  @Produces(MediaType.APPLICATION_JSON)
+  @Consumes(MediaType.APPLICATION_JSON)
+  public Response httpPostDoneReadingInput(
+      @PathParam("queryId") final String queryId,

Review Comment:
   ## Useless parameter
   
   The parameter 'queryId' is never used.
   
   [Show more 
details](https://github.com/apache/druid/security/code-scanning/7593)



##########
extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/exec/RunWorkOrder.java:
##########
@@ -0,0 +1,1045 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.msq.exec;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
+import com.google.common.util.concurrent.FutureCallback;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.MoreExecutors;
+import com.google.common.util.concurrent.SettableFuture;
+import org.apache.druid.common.guava.FutureUtils;
+import org.apache.druid.error.DruidException;
+import org.apache.druid.frame.allocation.ArenaMemoryAllocator;
+import org.apache.druid.frame.allocation.ArenaMemoryAllocatorFactory;
+import org.apache.druid.frame.channel.BlockingQueueFrameChannel;
+import org.apache.druid.frame.channel.ByteTracker;
+import org.apache.druid.frame.channel.FrameWithPartition;
+import org.apache.druid.frame.key.ClusterByPartitions;
+import org.apache.druid.frame.processor.BlockingQueueOutputChannelFactory;
+import org.apache.druid.frame.processor.Bouncer;
+import org.apache.druid.frame.processor.ComposingOutputChannelFactory;
+import org.apache.druid.frame.processor.FileOutputChannelFactory;
+import org.apache.druid.frame.processor.FrameChannelHashPartitioner;
+import org.apache.druid.frame.processor.FrameChannelMixer;
+import org.apache.druid.frame.processor.FrameProcessor;
+import org.apache.druid.frame.processor.FrameProcessorExecutor;
+import org.apache.druid.frame.processor.OutputChannel;
+import org.apache.druid.frame.processor.OutputChannelFactory;
+import org.apache.druid.frame.processor.OutputChannels;
+import org.apache.druid.frame.processor.PartitionedOutputChannel;
+import org.apache.druid.frame.processor.SuperSorter;
+import org.apache.druid.frame.processor.SuperSorterProgressTracker;
+import org.apache.druid.frame.processor.manager.ProcessorManager;
+import org.apache.druid.frame.processor.manager.ProcessorManagers;
+import org.apache.druid.frame.util.DurableStorageUtils;
+import org.apache.druid.frame.write.FrameWriters;
+import org.apache.druid.java.util.common.FileUtils;
+import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.UOE;
+import org.apache.druid.java.util.common.concurrent.Execs;
+import org.apache.druid.msq.counters.CounterNames;
+import org.apache.druid.msq.counters.CounterTracker;
+import org.apache.druid.msq.indexing.CountingOutputChannelFactory;
+import org.apache.druid.msq.indexing.InputChannelFactory;
+import org.apache.druid.msq.indexing.InputChannelsImpl;
+import 
org.apache.druid.msq.indexing.processor.KeyStatisticsCollectionProcessor;
+import org.apache.druid.msq.input.InputSlice;
+import org.apache.druid.msq.input.InputSliceReader;
+import org.apache.druid.msq.input.InputSlices;
+import org.apache.druid.msq.input.MapInputSliceReader;
+import org.apache.druid.msq.input.NilInputSlice;
+import org.apache.druid.msq.input.NilInputSliceReader;
+import org.apache.druid.msq.input.external.ExternalInputSlice;
+import org.apache.druid.msq.input.external.ExternalInputSliceReader;
+import org.apache.druid.msq.input.inline.InlineInputSlice;
+import org.apache.druid.msq.input.inline.InlineInputSliceReader;
+import org.apache.druid.msq.input.lookup.LookupInputSlice;
+import org.apache.druid.msq.input.lookup.LookupInputSliceReader;
+import org.apache.druid.msq.input.stage.InputChannels;
+import org.apache.druid.msq.input.stage.StageInputSlice;
+import org.apache.druid.msq.input.stage.StageInputSliceReader;
+import org.apache.druid.msq.input.table.SegmentsInputSlice;
+import org.apache.druid.msq.input.table.SegmentsInputSliceReader;
+import org.apache.druid.msq.kernel.FrameContext;
+import org.apache.druid.msq.kernel.FrameProcessorFactory;
+import org.apache.druid.msq.kernel.ProcessorsAndChannels;
+import org.apache.druid.msq.kernel.ShuffleSpec;
+import org.apache.druid.msq.kernel.StageDefinition;
+import org.apache.druid.msq.kernel.WorkOrder;
+import org.apache.druid.msq.shuffle.output.DurableStorageOutputChannelFactory;
+import org.apache.druid.msq.statistics.ClusterByStatisticsCollector;
+import org.apache.druid.msq.statistics.ClusterByStatisticsSnapshot;
+import org.apache.druid.utils.CloseableUtils;
+
+import javax.annotation.Nullable;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+
+/**
+ * Main worker logic for executing a {@link WorkOrder} in a {@link 
FrameProcessorExecutor}.
+ */
+public class RunWorkOrder
+{
+  private final WorkOrder workOrder;
+  private final InputChannelFactory inputChannelFactory;
+  private final CounterTracker counterTracker;
+  private final FrameProcessorExecutor exec;
+  private final String cancellationId;
+  private final int parallelism;
+  private final WorkerContext workerContext;
+  private final FrameContext frameContext;
+  private final RunWorkOrderListener listener;
+  private final boolean reindex;
+  private final boolean removeNullBytes;
+  private final ByteTracker intermediateSuperSorterLocalStorageTracker;
+  private final AtomicBoolean started = new AtomicBoolean();
+
+  private InputSliceReader inputSliceReader;
+  private OutputChannelFactory workOutputChannelFactory;
+  private OutputChannelFactory shuffleOutputChannelFactory;
+  private ResultAndChannels<?> workResultAndOutputChannels;
+  private SettableFuture<ClusterByPartitions> stagePartitionBoundariesFuture;
+  private ListenableFuture<OutputChannels> stageOutputChannelsFuture;
+
+  public RunWorkOrder(
+      final WorkOrder workOrder,
+      final InputChannelFactory inputChannelFactory,
+      final CounterTracker counterTracker,
+      final FrameProcessorExecutor exec,
+      final String cancellationId,
+      final WorkerContext workerContext,
+      final FrameContext frameContext,
+      final RunWorkOrderListener listener,
+      final boolean reindex,
+      final boolean removeNullBytes
+  )
+  {
+    this.workOrder = workOrder;
+    this.inputChannelFactory = inputChannelFactory;
+    this.counterTracker = counterTracker;
+    this.exec = exec;
+    this.cancellationId = cancellationId;
+    this.parallelism = workerContext.threadCount();
+    this.workerContext = workerContext;
+    this.frameContext = frameContext;
+    this.listener = listener;
+    this.reindex = reindex;
+    this.removeNullBytes = removeNullBytes;
+    this.intermediateSuperSorterLocalStorageTracker =
+        new ByteTracker(
+            
frameContext.storageParameters().isIntermediateStorageLimitConfigured()
+            ? 
frameContext.storageParameters().getIntermediateSuperSorterStorageMaxLocalBytes()
+            : Long.MAX_VALUE
+        );
+  }
+
+  /**
+   * Start execution of the provided {@link WorkOrder} in the provided {@link 
FrameProcessorExecutor}.
+   *
+   * Execution proceeds asynchronously after this method returns. The {@link 
RunWorkOrderListener} passed to the
+   * constructor of this instance can be used to track progress.
+   */
+  public void start() throws IOException
+  {
+    if (started.getAndSet(true)) {
+      throw new ISE("Already started");
+    }
+
+    final StageDefinition stageDef = workOrder.getStageDefinition();
+
+    try {
+      makeInputSliceReader();
+      makeWorkOutputChannelFactory();
+      makeShuffleOutputChannelFactory();
+      makeAndRunWorkProcessors();
+
+      if (stageDef.doesShuffle()) {
+        makeAndRunShuffleProcessors();
+      } else {
+        // No shuffling: work output _is_ stage output. Retain read-only 
versions to reduce memory footprint.
+        stageOutputChannelsFuture =
+            
Futures.immediateFuture(workResultAndOutputChannels.getOutputChannels().readOnly());
+      }
+
+      setUpCompletionCallbacks();
+    }
+    catch (Throwable t) {
+      // If start() has problems, cancel anything that was already kicked off, 
and close the FrameContext.
+      try {
+        exec.cancel(cancellationId);
+      }
+      catch (Throwable t2) {
+        t.addSuppressed(t2);
+      }
+
+      CloseableUtils.closeAndSuppressExceptions(frameContext, 
t::addSuppressed);
+      throw t;
+    }
+  }
+
+  /**
+   * Settable {@link ClusterByPartitions} future for global sort. Necessary 
because we don't know ahead of time
+   * what the boundaries will be. The controller decides based on statistics 
from all workers. Once the controller
+   * decides, its decision is written to this future, which allows sorting on 
workers to proceed.
+   */
+  @Nullable
+  public SettableFuture<ClusterByPartitions> 
getStagePartitionBoundariesFuture()
+  {
+    return stagePartitionBoundariesFuture;
+  }
+
+  private void makeInputSliceReader()
+  {
+    if (inputSliceReader != null) {
+      throw new ISE("inputSliceReader already created");
+    }
+
+    final String queryId = workOrder.getQueryDefinition().getQueryId();
+
+    final InputChannels inputChannels =
+        new InputChannelsImpl(
+            workOrder.getQueryDefinition(),
+            InputSlices.allReadablePartitions(workOrder.getInputs()),
+            inputChannelFactory,
+            () -> 
ArenaMemoryAllocator.createOnHeap(frameContext.memoryParameters().getStandardFrameSize()),
+            exec,
+            cancellationId,
+            removeNullBytes
+        );
+
+    inputSliceReader = new MapInputSliceReader(
+        ImmutableMap.<Class<? extends InputSlice>, InputSliceReader>builder()
+                    .put(NilInputSlice.class, NilInputSliceReader.INSTANCE)
+                    .put(StageInputSlice.class, new 
StageInputSliceReader(queryId, inputChannels))
+                    .put(ExternalInputSlice.class, new 
ExternalInputSliceReader(frameContext.tempDir("external")))
+                    .put(InlineInputSlice.class, new 
InlineInputSliceReader(frameContext.segmentWrangler()))
+                    .put(LookupInputSlice.class, new 
LookupInputSliceReader(frameContext.segmentWrangler()))
+                    .put(SegmentsInputSlice.class, new 
SegmentsInputSliceReader(frameContext, reindex))
+                    .build()
+    );
+  }
+
+  private void makeWorkOutputChannelFactory()
+  {
+    if (workOutputChannelFactory != null) {
+      throw new ISE("processorOutputChannelFactory already created");
+    }
+
+    final OutputChannelFactory baseOutputChannelFactory;
+
+    if (workOrder.getStageDefinition().doesShuffle()) {
+      // Writing to a consumer in the same JVM (the shuffle pipeline, which is set up later in
+      // makeAndRunShuffleProcessors). Use the large frame size if we're writing to a SuperSorter, since
+      // we'll generate fewer temp files if we use larger frames. Otherwise, use the standard frame size.
+      final int frameSize;
+
+      if (workOrder.getStageDefinition().getShuffleSpec().kind().isSort()) {
+        frameSize = frameContext.memoryParameters().getLargeFrameSize();
+      } else {
+        frameSize = frameContext.memoryParameters().getStandardFrameSize();
+      }
+
+      baseOutputChannelFactory = new 
BlockingQueueOutputChannelFactory(frameSize);
+    } else {
+      // Writing stage output.
+      baseOutputChannelFactory = makeStageOutputChannelFactory();
+    }
+
+    workOutputChannelFactory = new CountingOutputChannelFactory(
+        baseOutputChannelFactory,
+        counterTracker.channel(CounterNames.outputChannel())
+    );
+  }
+
+  private void makeShuffleOutputChannelFactory()
+  {
+    shuffleOutputChannelFactory =
+        new CountingOutputChannelFactory(
+            makeStageOutputChannelFactory(),
+            counterTracker.channel(CounterNames.shuffleChannel())
+        );
+  }
+
+  /**
+   * Use {@link FrameProcessorFactory#makeProcessors} to create {@link 
ProcessorsAndChannels}. Executes the
+   * processors using {@link #exec} and sets the output channels in {@link 
#workResultAndOutputChannels}.
+   *
+   * @param <FactoryType>         type of {@link 
StageDefinition#getProcessorFactory()}
+   * @param <ProcessorReturnType> return type of {@link FrameProcessor} 
created by the manager
+   * @param <ManagerReturnType>   result type of {@link 
ProcessorManager#result()}
+   * @param <ExtraInfoType>       type of {@link WorkOrder#getExtraInfo()}
+   */
+  private <FactoryType extends FrameProcessorFactory<ProcessorReturnType, 
ManagerReturnType, ExtraInfoType>, ProcessorReturnType, ManagerReturnType, 
ExtraInfoType> void makeAndRunWorkProcessors()
+      throws IOException
+  {
+    if (workResultAndOutputChannels != null) {
+      throw new ISE("workResultAndOutputChannels already set");
+    }
+
+    @SuppressWarnings("unchecked")
+    final FactoryType processorFactory = (FactoryType) 
workOrder.getStageDefinition().getProcessorFactory();
+
+    @SuppressWarnings("unchecked")
+    final ProcessorsAndChannels<ProcessorReturnType, ManagerReturnType> 
processors =
+        processorFactory.makeProcessors(
+            workOrder.getStageDefinition(),
+            workOrder.getWorkerNumber(),
+            workOrder.getInputs(),
+            inputSliceReader,
+            (ExtraInfoType) workOrder.getExtraInfo(),
+            workOutputChannelFactory,
+            frameContext,
+            parallelism,
+            counterTracker,
+            listener::onWarning,
+            removeNullBytes
+        );
+
+    final ProcessorManager<ProcessorReturnType, ManagerReturnType> 
processorManager = processors.getProcessorManager();
+
+    final int maxOutstandingProcessors;
+
+    if (processors.getOutputChannels().getAllChannels().isEmpty()) {
+      // No output channels: run up to "parallelism" processors at once.
+      maxOutstandingProcessors = Math.max(1, parallelism);
+    } else {
+      // If there are output channels, that acts as a ceiling on the number of 
processors that can run at once.
+      maxOutstandingProcessors =
+          Math.max(1, Math.min(parallelism, 
processors.getOutputChannels().getAllChannels().size()));
+    }
+
+    final ListenableFuture<ManagerReturnType> workResultFuture = 
exec.runAllFully(
+        processorManager,
+        maxOutstandingProcessors,
+        frameContext.processorBouncer(),
+        cancellationId
+    );
+
+    workResultAndOutputChannels = new ResultAndChannels<>(workResultFuture, 
processors.getOutputChannels());
+  }
+
+  private void makeAndRunShuffleProcessors()
+  {
+    if (stageOutputChannelsFuture != null) {
+      throw new ISE("stageOutputChannelsFuture already set");
+    }
+
+    final ShuffleSpec shuffleSpec = 
workOrder.getStageDefinition().getShuffleSpec();
+
+    final ShufflePipelineBuilder shufflePipeline = new ShufflePipelineBuilder(
+        workOrder,
+        counterTracker,
+        exec,
+        cancellationId,
+        frameContext
+    );
+
+    shufflePipeline.initialize(workResultAndOutputChannels);
+    
shufflePipeline.gatherResultKeyStatisticsAndReportDoneReadingInputIfNeeded();
+
+    switch (shuffleSpec.kind()) {
+      case MIX:
+        shufflePipeline.mix(shuffleOutputChannelFactory);
+        break;
+
+      case HASH:
+        shufflePipeline.hashPartition(shuffleOutputChannelFactory);
+        break;
+
+      case HASH_LOCAL_SORT:
+        final OutputChannelFactory hashOutputChannelFactory;
+
+        if (shuffleSpec.partitionCount() == 1) {
+          // Single partition; no need to write temporary files.
+          hashOutputChannelFactory =
+              new 
BlockingQueueOutputChannelFactory(frameContext.memoryParameters().getStandardFrameSize());
+        } else {
+          // Multi-partition; write temporary files and then sort each one 
file-by-file.
+          hashOutputChannelFactory =
+              new FileOutputChannelFactory(
+                  frameContext.tempDir("hash-parts"),
+                  frameContext.memoryParameters().getStandardFrameSize(),
+                  null
+              );
+        }
+
+        shufflePipeline.hashPartition(hashOutputChannelFactory);
+        shufflePipeline.localSort(shuffleOutputChannelFactory);
+        break;
+
+      case GLOBAL_SORT:
+        shufflePipeline.globalSort(shuffleOutputChannelFactory, 
makeGlobalSortPartitionBoundariesFuture());
+        break;
+
+      default:
+        throw new UOE("Cannot handle shuffle kind [%s]", shuffleSpec.kind());
+    }
+
+    stageOutputChannelsFuture = shufflePipeline.build();
+  }
+
+  private ListenableFuture<ClusterByPartitions> 
makeGlobalSortPartitionBoundariesFuture()
+  {
+    if (workOrder.getStageDefinition().mustGatherResultKeyStatistics()) {
+      if (stagePartitionBoundariesFuture != null) {
+        throw new ISE("Cannot call 'makeGlobalSortPartitionBoundariesFuture' 
twice");
+      }
+
+      return (stagePartitionBoundariesFuture = SettableFuture.create());
+    } else {
+      // Result key stats aren't needed, so the partition boundaries are 
knowable ahead of time. Compute them now.
+      final ClusterByPartitions boundaries =
+          workOrder.getStageDefinition()
+                   .generatePartitionBoundariesForShuffle(null)
+                   .valueOrThrow();
+
+      return Futures.immediateFuture(boundaries);
+    }
+  }
+
+  private void setUpCompletionCallbacks()
+  {
+    Futures.addCallback(
+        Futures.allAsList(
+            Arrays.asList(
+                workResultAndOutputChannels.getResultFuture(),
+                stageOutputChannelsFuture
+            )
+        ),
+        new FutureCallback<List<Object>>()
+        {
+          @Override
+          public void onSuccess(final List<Object> 
workerResultAndOutputChannelsResolved)
+          {
+            final Object resultObject = 
workerResultAndOutputChannelsResolved.get(0);
+            final OutputChannels outputChannels = (OutputChannels) 
workerResultAndOutputChannelsResolved.get(1);
+
+            if (workOrder.getOutputChannelMode() != OutputChannelMode.MEMORY) {
+              // In non-MEMORY output channel modes, call 
onOutputChannelAvailable when all work is done.
+              // (In MEMORY mode, we would have called 
onOutputChannelAvailable when the channels were created.)
+              for (final OutputChannel channel : 
outputChannels.getAllChannels()) {
+                listener.onOutputChannelAvailable(channel);
+              }
+            }
+
+            if (workOrder.getOutputChannelMode().isDurable()) {
+              // In DURABLE_STORAGE output channel mode, write a success file 
once all work is done.
+              writeDurableStorageSuccessFile();
+            }
+
+            listener.onSuccess(resultObject);
+          }
+
+          @Override
+          public void onFailure(final Throwable t)
+          {
+            listener.onFailure(t);
+          }
+        },
+        Execs.directExecutor()
+    );
+  }
+
+  /**
+   * Write {@link DurableStorageUtils#SUCCESS_MARKER_FILENAME} for a 
particular stage, if durable storage is enabled.
+   */
+  private void writeDurableStorageSuccessFile()
+  {
+    final DurableStorageOutputChannelFactory 
durableStorageOutputChannelFactory =
+        makeDurableStorageOutputChannelFactory(
+            frameContext.tempDir("durable"),
+            frameContext.memoryParameters().getStandardFrameSize(),
+            workOrder.getOutputChannelMode() == 
OutputChannelMode.DURABLE_STORAGE_QUERY_RESULTS
+        );
+
+    try {
+      
durableStorageOutputChannelFactory.createSuccessFile(workerContext.workerId());
+    }
+    catch (IOException e) {
+      throw new ISE(
+          e,
+          "Unable to create success file at location[%s]",
+          DurableStorageUtils.SUCCESS_MARKER_FILENAME,
+          durableStorageOutputChannelFactory.getSuccessFilePath()
+      );

Review Comment:
   ## Unused format argument
   
   This format call refers to 1 argument(s) but supplies 2 argument(s).
   
   [Show more 
details](https://github.com/apache/druid/security/code-scanning/7590)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] MSQ worker: Support in-memory shuffles. (druid)

Reply via email to