maedhroz commented on code in PR #3543:
URL: https://github.com/apache/cassandra/pull/3543#discussion_r1778897908


##########
test/unit/org/apache/cassandra/metrics/DecayingEstimatedHistogramReservoirTest.java:
##########
@@ -604,6 +616,110 @@ public void testSize()
             assertEquals(2, toSnapshot.apply(histogram).size());
         }
 
+        /**
+         * This looks for invalid percentiles that are unchanged for too long 
to expose the CASSANDRA-19365 race
+         * condition between rescale and update. The idea is to update a 
histogram from multiple threads and observe
+         * if the reported p99 doesn't get stuck at a low value or p50 at a 
high value due to update with high weight
+         * being inserted after the buckets are rescaled.
+         * <p>
+         * The load has 95% of 42, and 5% of the time it's 1109. Despite that 
the histogram may be convinced for a long
+         * time that p99 is 42 or that p50 is 1109. The reason may be seen in 
the snapshot dump, where after rescale
+         * the bucket values may get very big due to the race condition and 
too big weight of the inserted samples.
+         * The values were picked to match bucket boundaries, but that's only 
for aesthetics.
+         * <p>
+         * In production the rescale happens every 30 minutes. In this test 
time we're pushing time to run faster,
+         * roughly 1000 times faster to hit the race condition in a reasonable 
time.
+         */
+        @Test
+        public void testConcurrentUpdateAndRescale() throws 
InterruptedException
+        {
+            int UPDATE_THREADS = 60;
+            int maxTestDurationMillis = 60_000;
+            // how many times in a row the percentiles may be invalid before 
we fail the test
+            int tooManySuspiciousPercentilesThreshold = 5; // 5 translates to 
500ms * 1000 speedup = 500s = 8m20s;
+            AtomicBoolean stop = new AtomicBoolean(false);
+            AtomicBoolean failed = new AtomicBoolean(false);
+            TestClock clock = new TestClock();
+
+            DecayingEstimatedHistogramReservoir histogram = new 
DecayingEstimatedHistogramReservoir(clock);
+            ExecutorService executors = Executors.newFixedThreadPool(2 + 
UPDATE_THREADS);
+
+            for (int i = 0; i < UPDATE_THREADS; i++)
+            {
+                executors.submit(() -> {
+                    while (!stop.get() && 
!Thread.currentThread().isInterrupted())
+                    {
+                        // a mischievous usage pattern to quickly trigger the
+                        // CASSANDRA-19365 race condition;
+                        // the load has 95% of 42, and only 5% of the time 
it's 1109
+                        // and yet, the histogram may be convinced for a long 
time that
+                        // the p99 is 42 or that the p50 is 1109
+                        for (int sampleIdx = 0; sampleIdx < 900; sampleIdx++)
+                            histogram.update(42);
+
+                        for (int sampleIdx = 0; sampleIdx < 50; sampleIdx++)
+                        {
+                            // add some noise so that low value samples do not 
race with the same likelyhood as the high value samples
+                            
Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(1, 
10), MILLISECONDS);
+                            histogram.update(1109);
+                        }
+                    }
+                });
+            }
+            // clock update thread
+            executors.submit(() -> {
+                while (!stop.get() && !Thread.currentThread().isInterrupted())
+                {
+                    Uninterruptibles.sleepUninterruptibly(1, 
TimeUnit.MILLISECONDS);
+                    // x1000 speedup so that we hit rescale interval every 30 
minutes / 1000 = 1.8s
+                    clock.addMillis(1000);
+                }
+            });
+            // percentiles check thread
+            executors.submit(() -> {
+                // how many times in a row p99 was suspiciously low or P50 
suspiciously high
+                int consecutiveInvalidPercentiles = 0;
+
+                // how often to check the percentiles
+                int iterationDelayMillis = 100;
+
+                for (int i = 0; i < maxTestDurationMillis / 
iterationDelayMillis; i++)
+                {
+                    
Uninterruptibles.sleepUninterruptibly(iterationDelayMillis, MILLISECONDS);
+                    Snapshot snapshot = toSnapshot.apply(histogram);
+                    double p99 = snapshot.getValue(0.99);
+                    double p50 = snapshot.getValue(0.50);
+                    logger.info("{} p99: {}", clock.now() / 1_000_000, p99);
+                    logger.info("{} p50: {}", clock.now() / 1_000_000, p50);
+                    ByteArrayOutputStream output = new ByteArrayOutputStream();
+                    snapshot.dump(output);
+                    String decayingNonZeroBuckets = 
Arrays.stream(output.toString().split("\n"))
+                                                          .filter(s -> 
!s.equals("0"))
+                                                          
.collect(Collectors.joining(","));
+                    logger.info("decaying non-zero buckets:  {}", 
decayingNonZeroBuckets);
+                    if (p99 < 100 || p50 > 900)
+                    {
+                        consecutiveInvalidPercentiles++;
+                        logger.warn("p50 or p99 at suspicious level p50={}, 
p99={}", p50, p99);
+                        if (consecutiveInvalidPercentiles > 
tooManySuspiciousPercentilesThreshold)
+                        {
+                            failed.set(true);
+                            stop.set(true);
+                            break;
+                        }
+                    }
+                    else
+                    {
+                        consecutiveInvalidPercentiles = 0;
+                    }
+                }
+                stop.set(true);
+            });
+            executors.shutdown();
+            executors.awaitTermination(480000, MILLISECONDS);

Review Comment:
   ....and perhaps at least warn, if not fail outright, if we aren't able to 
terminate in time, which means the whole test may not have run?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to