(druid) branch 30.0.0 updated: Fix another deadlock which can occur while acquiring merge buffers (#16372) (#16427)

adarshsanjeev Fri, 10 May 2024 02:45:51 -0700

This is an automated email from the ASF dual-hosted git repository.

adarshsanjeev pushed a commit to branch 30.0.0
in repository https://gitbox.apache.org/repos/asf/druid.git



The following commit(s) were added to refs/heads/30.0.0 by this push:
     new 305abae8b19 Fix another deadlock which can occur while acquiring merge 
buffers (#16372) (#16427)
305abae8b19 is described below

commit 305abae8b198cd10f8f718d6da053348e3e5ac89
Author: Adarsh Sanjeev <[email protected]>
AuthorDate: Fri May 10 15:15:38 2024 +0530

    Fix another deadlock which can occur while acquiring merge buffers (#16372) 
(#16427)
    
    Fixes a deadlock while acquiring merge buffers
    
    Co-authored-by: Laksh Singla <[email protected]>
---
 .../groupby/GroupByResourcesReservationPool.java   |  76 +++++--
 .../apache/druid/query/groupby/GroupingEngine.java |   2 +
 .../GroupByResourcesReservationPoolTest.java       | 227 +++++++++++++++++++++
 3 files changed, 291 insertions(+), 14 deletions(-)

diff --git 
a/processing/src/main/java/org/apache/druid/query/groupby/GroupByResourcesReservationPool.java
 
b/processing/src/main/java/org/apache/druid/query/groupby/GroupByResourcesReservationPool.java
index 49a698b42ae..9c65ce445b1 100644
--- 
a/processing/src/main/java/org/apache/druid/query/groupby/GroupByResourcesReservationPool.java
+++ 
b/processing/src/main/java/org/apache/druid/query/groupby/GroupByResourcesReservationPool.java
@@ -28,6 +28,7 @@ import javax.annotation.Nullable;
 import javax.inject.Inject;
 import java.nio.ByteBuffer;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicReference;
 
 /**
  * Reserves the {@link GroupByQueryResources} for a given group by query and 
maps them to the query's resource ID.
@@ -67,21 +68,25 @@ import java.util.concurrent.ConcurrentHashMap;
  * nested ones execute via an unoptimized way.
  * 3. There's some knowledge to the mergeResults that the query runner passed 
to it is the one created by the corresponding toolchest's
  * mergeRunners (which is the typical use case). This is encoded in the 
argument {@code willMergeRunner}, and is to be set by the callers.
- * The only production use case where this isn't true is when the broker is 
merging the results gathered from the historical)
+ * The only production use case where this isn't true is when the broker is 
merging the results gathered from the historical
  * <p>
  * TESTING
  * Unit tests mimic the broker-historical interaction in many places, which 
can lead to the code not working as intended because the assumptions don't hold.
  * In many test cases, there are two nested mergeResults calls, the outer call 
mimics what the broker does, while the inner one mimics what the historical 
does,
  * and the assumption (1) fails. Therefore, the testing code should assign a 
unique resource id b/w each mergeResults call, and also make sure that the top 
level mergeResults
  * would have willMergeRunner = false, since it's being called on top of a 
mergeResults's runner, while the inner one would have willMergeRunner = true 
because its being
- * called on actual runners (as it happens in the brokers, and the historicals)
+ * called on actual runners (as it happens in the brokers, and the 
historicals).
+ * <p>
+ * There is a test in GroupByResourcesReservationPoolTest that checks for 
deadlocks when the operations are interleaved in a
+ * certain maanner. It is ignored because it sleeps and can increase time when 
the test suite is run. Developers making any changes
+ * to this class, or a related class should manually verify that all the tests 
in the test class are running as expected.
  */
 public class GroupByResourcesReservationPool
 {
   /**
    * Map of query's resource id -> group by resources reserved for the query 
to execute
    */
-  final ConcurrentHashMap<QueryResourceId, GroupByQueryResources> pool = new 
ConcurrentHashMap<>();
+  final ConcurrentHashMap<QueryResourceId, 
AtomicReference<GroupByQueryResources>> pool = new ConcurrentHashMap<>();
 
   /**
    * Buffer pool from where the merge buffers are picked and reserved
@@ -104,19 +109,42 @@ public class GroupByResourcesReservationPool
   }
 
   /**
-   * Reserves appropriate resources, and maps it to the queryResourceId 
(usually the query's resource id) in the internal map
+   * Reserves appropriate resources, and maps it to the queryResourceId 
(usually the query's resource id) in the internal map.
+   * This is a blocking call, and can block up to the given query's timeout
    */
   public void reserve(QueryResourceId queryResourceId, GroupByQuery 
groupByQuery, boolean willMergeRunner)
   {
     if (queryResourceId == null) {
       throw DruidException.defensive("Query resource id must be populated");
     }
-    pool.compute(queryResourceId, (id, existingResource) -> {
-      if (existingResource != null) {
-        throw DruidException.defensive("Resource with the given identifier 
[%s] is already present", id);
-      }
-      return GroupingEngine.prepareResource(groupByQuery, mergeBufferPool, 
willMergeRunner, groupByQueryConfig);
-    });
+
+    // First check if the query resource id is present in the map, and if not, 
populate a dummy reference. This will
+    // block other threads from populating the map with the same query id, and 
is essentially same as reserving a spot in
+    // the map for the given query id. Since the actual allocation of the 
resource might take longer than expected, we
+    // do it out of the critical section, once we have "reserved" the spot
+    AtomicReference<GroupByQueryResources> reference = new 
AtomicReference<>(null);
+    AtomicReference<GroupByQueryResources> existingResource = 
pool.putIfAbsent(queryResourceId, reference);
+
+    // Multiple attempts made to allocate the query resource for a given 
resource id. Throw an exception
+    //noinspection VariableNotUsedInsideIf
+    if (existingResource != null) {
+      throw DruidException.defensive("Resource with the given identifier [%s] 
is already present", queryResourceId);
+    }
+
+    GroupByQueryResources resources;
+    try {
+      // We have reserved a spot in the map. Now begin the blocking call.
+      resources = GroupingEngine.prepareResource(groupByQuery, 
mergeBufferPool, willMergeRunner, groupByQueryConfig);
+    }
+    catch (Throwable t) {
+      // Unable to allocate the resources, perform cleanup and rethrow the 
exception
+      pool.remove(queryResourceId);
+      throw t;
+    }
+
+    // Resources have been allocated, spot has been reserved. The reference 
would ALWAYS refer to 'null'. Refer the
+    // allocated resources from it
+    reference.compareAndSet(null, resources);
   }
 
   /**
@@ -125,7 +153,19 @@ public class GroupByResourcesReservationPool
   @Nullable
   public GroupByQueryResources fetch(QueryResourceId queryResourceId)
   {
-    return pool.get(queryResourceId);
+    AtomicReference<GroupByQueryResources> resourcesReference = 
pool.get(queryResourceId);
+    if (resourcesReference == null) {
+      // There weren't any resources allocated corresponding to the provided 
resource id
+      return null;
+    }
+    GroupByQueryResources resource = resourcesReference.get();
+    if (resource == null) {
+      throw DruidException.defensive(
+          "Query id [%s] had a non-null reference in the resource reservation 
pool, but no resources were found",
+          queryResourceId
+      );
+    }
+    return resource;
   }
 
   /**
@@ -133,9 +173,17 @@ public class GroupByResourcesReservationPool
    */
   public void clean(QueryResourceId queryResourceId)
   {
-    GroupByQueryResources resources = pool.remove(queryResourceId);
-    if (resources != null) {
-      resources.close();
+    AtomicReference<GroupByQueryResources> resourcesReference = 
pool.remove(queryResourceId);
+    if (resourcesReference != null) {
+      GroupByQueryResources resource = resourcesReference.get();
+      // Reference should refer to a non-empty resource
+      if (resource == null) {
+        throw DruidException.defensive(
+            "Query id [%s] had a non-null reference in the resource 
reservation pool, but no resources were found",
+            queryResourceId
+        );
+      }
+      resource.close();
     }
   }
 }
diff --git 
a/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java 
b/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java
index 5fa26d3fa2b..6451fb9b943 100644
--- 
a/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java
+++ 
b/processing/src/main/java/org/apache/druid/query/groupby/GroupingEngine.java
@@ -142,6 +142,8 @@ public class GroupingEngine
    * {@link GroupByMergingQueryRunner} for a particular query. The resources 
are to be acquired once throughout the
    * execution of the query, or need to be re-acquired (if needed). Users must 
ensure that throughout the execution,
    * a query already holding the resources shouldn't request for more 
resources, because that can cause deadlocks.
+   * <p>
+   * This method throws an exception if it is not able to allocate sufficient 
resources required for the query to succeed
    */
   public static GroupByQueryResources prepareResource(
       GroupByQuery query,
diff --git 
a/processing/src/test/java/org/apache/druid/query/groupby/GroupByResourcesReservationPoolTest.java
 
b/processing/src/test/java/org/apache/druid/query/groupby/GroupByResourcesReservationPoolTest.java
new file mode 100644
index 00000000000..8d0f2d9e37d
--- /dev/null
+++ 
b/processing/src/test/java/org/apache/druid/query/groupby/GroupByResourcesReservationPoolTest.java
@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.groupby;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import org.apache.druid.collections.BlockingPool;
+import org.apache.druid.collections.DefaultBlockingPool;
+import org.apache.druid.error.DruidException;
+import org.apache.druid.java.util.common.Intervals;
+import org.apache.druid.java.util.common.concurrent.Execs;
+import org.apache.druid.java.util.common.granularity.Granularities;
+import org.apache.druid.query.QueryResourceId;
+import org.apache.druid.query.dimension.DefaultDimensionSpec;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+
+public class GroupByResourcesReservationPoolTest
+{
+
+  /**
+   * CONFIG + QUERY require exactly 1 merge buffer to succeed if 
'willMergeRunners' is true while allocating the resources
+   */
+  private static final GroupByQueryConfig CONFIG = new GroupByQueryConfig();
+  private static final GroupByQuery QUERY = GroupByQuery.builder()
+                                                        
.setInterval(Intervals.ETERNITY)
+                                                        .setDataSource("foo")
+                                                        .setDimensions(
+                                                            ImmutableList.of(
+                                                                new 
DefaultDimensionSpec("dim2", "_d0")
+                                                            )
+                                                        )
+                                                        
.setGranularity(Granularities.ALL)
+                                                        .setContext(
+                                                            
ImmutableMap.of("timeout", 0)
+                                                        ) // Query can block 
indefinitely
+                                                        .build();
+
+  /**
+   * This test confirms that the interleaved 
GroupByResourcesReservationPool.reserve() and 
GroupByResourcesReservationPool.clean()
+   * between multiple threads succeed. It is specifically designed to test the 
case when the operations are interleaved in the
+   * following manner:
+   * <p>
+   * THREAD1                      THREAD2
+   * pool.reserve(query1)
+   *                              pool.reserve(query2)
+   * pool.clean(query1)
+   * <p>
+   * This test assumes a few things about the implementation of the 
interfaces, which are laid out in the comments.
+   * <p>
+   * The test should complete under 10 seconds, and the majority of the time 
would be consumed by waiting for the thread
+   * that sleeps for 5 seconds
+   */
+  @Ignore(
+      "Isn't run as a part of CI since it sleeps for 5 seconds. Callers must 
run the test manually if any changes are made "
+      + "to the corresponding class"
+  )
+  @Test(timeout = 100_000L)
+  public void testInterleavedReserveAndRemove()
+  {
+    ExecutorService executor = Execs.multiThreaded(3, 
"group-by-resources-reservation-pool-test-%d");
+
+    // Sanity checks that the query will acquire exactly one merge buffer. 
This safeguards the test being useful in
+    // case the merge buffer acquisition code changes to acquire less than one 
merge buffer (the test would be
+    // useless in that case) or more than one merge buffer (the test would 
incorrectly fail in that case)
+    Assert.assertEquals(
+        1,
+        
GroupByQueryResources.countRequiredMergeBufferNumForMergingQueryRunner(CONFIG, 
QUERY)
+        + 
GroupByQueryResources.countRequiredMergeBufferNumForToolchestMerge(QUERY)
+    );
+
+    // Blocking pool with a single buffer, which means only one of the queries 
can succeed at a time
+    BlockingPool<ByteBuffer> mergeBufferPool = new DefaultBlockingPool<>(() -> 
ByteBuffer.allocate(100), 1);
+    GroupByResourcesReservationPool groupByResourcesReservationPool =
+        new GroupByResourcesReservationPool(mergeBufferPool, CONFIG);
+
+    // Latch indicating that the first thread has called 
reservationPool.reserve()
+    CountDownLatch reserveCalledByFirstThread = new CountDownLatch(1);
+    // Latch indicating that the second thread has called 
reservationPool.reserve()
+    CountDownLatch reserveCalledBySecondThread = new CountDownLatch(1);
+    // Latch indicating that all the threads have been completed successfully. 
Main thread waits on this latch before exiting
+    CountDownLatch threadsCompleted = new CountDownLatch(2);
+
+    // THREAD 1
+    executor.submit(() -> {
+
+      QueryResourceId queryResourceId1 = new QueryResourceId("test-id-1")
+      {
+        @Override
+        public int hashCode()
+        {
+          // IMPORTANT ASSUMPTION: For the test to be useful, it assumes that 
under the hood we are using a
+          // ConcurrentHashMap<QueryResourceId, GroupByResources> (or a 
concurrent map with similar implementation) that
+          // implements granular locking of the nodes
+          // The hashCode of the queryResourceId used in Thread1 and Thread2 
is the same. Therefore, both the queryIds
+          // would be guarded by the same lock
+          return 10;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+          return super.equals(o);
+        }
+      };
+      groupByResourcesReservationPool.reserve(queryResourceId1, QUERY, true);
+      reserveCalledByFirstThread.countDown();
+      try {
+        reserveCalledBySecondThread.await();
+      }
+      catch (InterruptedException e) {
+        Assert.fail("Interrupted while waiting for second reserve call to be 
made");
+      }
+      groupByResourcesReservationPool.clean(queryResourceId1);
+      threadsCompleted.countDown();
+    });
+
+    // THREAD 2
+    executor.submit(() -> {
+      try {
+        reserveCalledByFirstThread.await();
+      }
+      catch (InterruptedException e) {
+        Assert.fail("Interrupted while waiting for first reserve call to be 
made");
+      }
+
+      QueryResourceId queryResourceId2 = new QueryResourceId("test-id-2")
+      {
+        @Override
+        public int hashCode()
+        {
+          return 10;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+          return super.equals(o);
+        }
+      };
+
+      // Since the reserve() call is blocking, we need to execute it 
separately, so that we can count down the latch
+      // and inform Thread1 the reserve call has been made by this thread
+      executor.submit(() -> {
+        groupByResourcesReservationPool.reserve(queryResourceId2, QUERY, true);
+        threadsCompleted.countDown();
+      });
+      try {
+        // This sleep call "ensures" that the statment 
pool.reserve(queryResourceId2) is called before we release the
+        // latch (that will cause Thread1 to release the acquired resources). 
It still doesn't guarantee the previous
+        // statement, however that's the best we can do, given that reserve() 
is blocking
+        Thread.sleep(5_000);
+      }
+      catch (InterruptedException e) {
+        Assert.fail("Interrupted while sleeping");
+      }
+      reserveCalledBySecondThread.countDown();
+    });
+
+    try {
+      threadsCompleted.await();
+    }
+    catch (InterruptedException e) {
+      Assert.fail("Interrupted while waiting for the threads to complete");
+    }
+  }
+
+  @Test
+  public void testMultipleSimultaneousAllocationAttemptsFail()
+  {
+    BlockingPool<ByteBuffer> mergeBufferPool = new DefaultBlockingPool<>(() -> 
ByteBuffer.allocate(100), 1);
+    GroupByResourcesReservationPool groupByResourcesReservationPool =
+        new GroupByResourcesReservationPool(mergeBufferPool, CONFIG);
+    QueryResourceId queryResourceId = new QueryResourceId("test-id");
+
+    groupByResourcesReservationPool.reserve(queryResourceId, QUERY, true);
+
+    Assert.assertThrows(
+        DruidException.class,
+        () -> groupByResourcesReservationPool.reserve(queryResourceId, QUERY, 
true)
+    );
+  }
+
+  @Test
+  public void testMultipleSequentialAllocationAttemptsSucceed()
+  {
+    BlockingPool<ByteBuffer> mergeBufferPool = new DefaultBlockingPool<>(() -> 
ByteBuffer.allocate(100), 1);
+    GroupByResourcesReservationPool groupByResourcesReservationPool =
+        new GroupByResourcesReservationPool(mergeBufferPool, CONFIG);
+    QueryResourceId queryResourceId = new QueryResourceId("test-id");
+
+    groupByResourcesReservationPool.reserve(queryResourceId, QUERY, true);
+    GroupByQueryResources oldResources = 
groupByResourcesReservationPool.fetch(queryResourceId);
+
+    // Cleanup the resources
+    groupByResourcesReservationPool.clean(queryResourceId);
+
+    // Repeat the calls
+    groupByResourcesReservationPool.reserve(queryResourceId, QUERY, true);
+    GroupByQueryResources newResources = 
groupByResourcesReservationPool.fetch(queryResourceId);
+    Assert.assertNotNull(newResources);
+
+    Assert.assertNotSame(oldResources, newResources);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(druid) branch 30.0.0 updated: Fix another deadlock which can occur while acquiring merge buffers (#16372) (#16427)

Reply via email to