neoremind commented on PR #15805:
URL: https://github.com/apache/lucene/pull/15805#issuecomment-4053967741

   hi folks, jumping in to share my findings and views.
   
   Let data speak. I've spun up a quick JMH benchmark to test.
   
   Code:
   ```
   @Fork(1)
   @Warmup(iterations = 3, time = 3)
   @Measurement(iterations = 5, time = 3)
   @BenchmarkMode(Mode.Throughput)
   @OutputTimeUnit(TimeUnit.MICROSECONDS)
   @State(Scope.Benchmark)
   public class StringHelperRandomIdBenchmark {
   
     private static BigInteger nextId;
   
     // Holds 128 bit unsigned value:
     private static AtomicReference<BigInteger> casNextId;
   
     private static BigInteger mask128;
     private static final Object idLock = new Object();
   
     @Setup
     public void init() {
       // 128 bit unsigned mask
       byte[] maskBytes128 = new byte[16];
       Arrays.fill(maskBytes128, (byte) 0xff);
       mask128 = new BigInteger(1, maskBytes128);
   
       String prop = System.getProperty("tests.seed");
   
       // State for xorshift128:
       long x0;
       long x1;
   
       if (prop != null) {
         // So if there is a test failure that somehow relied on this id,
         // we remain reproducible based on the test seed:
         if (prop.length() > 8) {
           prop = prop.substring(prop.length() - 8);
         }
         x0 = Long.parseLong(prop, 16);
         x1 = x0;
       } else {
         // seed from /dev/urandom, if its available
         try (DataInputStream is =
             new 
DataInputStream(Files.newInputStream(Paths.get("/dev/urandom")))) {
           x0 = is.readLong();
           x1 = is.readLong();
         } catch (Exception _) {
           // may not be available on this platform
           // fall back to lower quality randomness from 3 different sources:
           x0 = System.nanoTime();
           x1 = (long) StringHelperRandomIdBenchmark.class.hashCode() << 32;
   
           StringBuilder sb = new StringBuilder();
           // Properties can vary across JVM instances:
           try {
             Properties p = System.getProperties();
             for (String s : p.stringPropertyNames()) {
               sb.append(s);
               sb.append(p.getProperty(s));
             }
             x1 |= sb.toString().hashCode();
           } catch (SecurityException _) {
             // getting Properties requires wildcard read-write: may not be 
allowed
             x1 |= StringBuffer.class.hashCode();
           }
         }
       }
   
       // Use a few iterations of xorshift128 to scatter the seed
       // in case multiple Lucene instances starting up "near" the same
       // nanoTime, since we use ++ (mod 2^128) for full period cycle:
       for (int i = 0; i < 10; i++) {
         long s1 = x0;
         long s0 = x1;
         x0 = s0;
         s1 ^= s1 << 23; // a
         x1 = s1 ^ s0 ^ (s1 >>> 17) ^ (s0 >>> 26); // b, c
       }
   
       // 64-bit unsigned mask
       byte[] maskBytes64 = new byte[8];
       Arrays.fill(maskBytes64, (byte) 0xff);
       BigInteger mask64 = new BigInteger(1, maskBytes64);
   
       // First make unsigned versions of x0, x1:
       BigInteger unsignedX0 = BigInteger.valueOf(x0).and(mask64);
       BigInteger unsignedX1 = BigInteger.valueOf(x1).and(mask64);
   
       // Concatentate bits of x0 and x1, as unsigned 128 bit integer:
       nextId = unsignedX0.shiftLeft(64).or(unsignedX1);
   
       casNextId = new 
AtomicReference<>(unsignedX0.shiftLeft(64).or(unsignedX1));
     }
   
     private byte[] synchronizedRandomId() {
       byte[] bits;
       synchronized (idLock) {
         bits = nextId.toByteArray();
         nextId = nextId.add(BigInteger.ONE).and(mask128);
       }
       return bits;
     }
   
     private byte[] lockScopeReducedSynchronizedRandomId() {
       final BigInteger scratch;
       synchronized (idLock) {
         scratch = nextId;
         nextId = nextId.add(BigInteger.ONE).and(mask128);
       }
       return scratch.toByteArray();
     }
   
     private byte[] casRandomId() {
       BigInteger next = casNextId.getAndUpdate(n -> 
n.add(BigInteger.ONE).and(mask128));
       return next.toByteArray();
     }
   
     // ---- 1 thread ----
   
     @Benchmark
     @Threads(1)
     public void synchronized_1thread(Blackhole bh) {
       bh.consume(synchronizedRandomId());
     }
   
     @Benchmark
     @Threads(1)
     public void lockScopeReducedSynchronized_1thread(Blackhole bh) {
       bh.consume(lockScopeReducedSynchronizedRandomId());
     }
   
     @Benchmark
     @Threads(1)
     public void cas_1thread(Blackhole bh) {
       bh.consume(casRandomId());
     }
   
     // ---- 2 threads ----
   
     @Benchmark
     @Threads(2)
     public void synchronized_2threads(Blackhole bh) {
       bh.consume(synchronizedRandomId());
     }
   
     @Benchmark
     @Threads(2)
     public void lockScopeReducedSynchronized_2thread(Blackhole bh) {
       bh.consume(lockScopeReducedSynchronizedRandomId());
     }
   
     @Benchmark
     @Threads(2)
     public void cas_2threads(Blackhole bh) {
       bh.consume(casRandomId());
     }
   
     // ---- 4 threads ----
   
     @Benchmark
     @Threads(4)
     public void synchronized_4threads(Blackhole bh) {
       bh.consume(synchronizedRandomId());
     }
   
     @Benchmark
     @Threads(4)
     public void lockScopeReducedSynchronized_4thread(Blackhole bh) {
       bh.consume(lockScopeReducedSynchronizedRandomId());
     }
   
     @Benchmark
     @Threads(4)
     public void cas_4threads(Blackhole bh) {
       bh.consume(casRandomId());
     }
   
     // ---- 8 threads ----
   
     @Benchmark
     @Threads(8)
     public void synchronized_8threads(Blackhole bh) {
       bh.consume(synchronizedRandomId());
     }
   
     @Benchmark
     @Threads(8)
     public void lockScopeReducedSynchronized_8thread(Blackhole bh) {
       bh.consume(lockScopeReducedSynchronizedRandomId());
     }
   
     @Benchmark
     @Threads(8)
     public void cas_8threads(Blackhole bh) {
       bh.consume(casRandomId());
     }
   }
   ```
   
   Results:
   
   Mac M3 w/ 12 cores
   OpenJDK 64-Bit Server VM Corretto-25.0.1.8.1 (build 25.0.1+8-LTS, mixed 
mode, sharing)
   
   ```
   Benchmark                                                            Mode  
Cnt   Score   Error   Units
   StringHelperRandomIdBenchmark.cas_1thread                           thrpt    
5  14.888 ± 0.326  ops/us
   StringHelperRandomIdBenchmark.cas_2threads                          thrpt    
5  20.645 ± 0.063  ops/us
   StringHelperRandomIdBenchmark.cas_4threads                          thrpt    
5  11.658 ± 0.104  ops/us
   StringHelperRandomIdBenchmark.cas_8threads                          thrpt    
5   3.893 ± 0.107  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_1thread  thrpt    
5  18.833 ± 2.372  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_2thread  thrpt    
5  10.412 ± 0.695  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_4thread  thrpt    
5   9.870 ± 0.269  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_8thread  thrpt    
5   6.249 ± 0.043  ops/us
   StringHelperRandomIdBenchmark.synchronized_1thread                  thrpt    
5  35.273 ± 2.583  ops/us
   StringHelperRandomIdBenchmark.synchronized_2threads                 thrpt    
5  17.062 ± 0.665  ops/us
   StringHelperRandomIdBenchmark.synchronized_4threads                 thrpt    
5  16.695 ± 0.990  ops/us
   StringHelperRandomIdBenchmark.synchronized_8threads                 thrpt    
5  13.370 ± 0.455  ops/us
   ```
   
   EC2 r4.4xlarge with 16vCPU Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz 
   OpenJDK 64-Bit Server VM (build 25.0.1+8-27, mixed mode, sharing)
   ```
   Benchmark                                                            Mode  
Cnt   Score   Error   Units
   StringHelperRandomIdBenchmark.cas_1thread                           thrpt    
5  10.064 ± 1.125  ops/us
   StringHelperRandomIdBenchmark.cas_2threads                          thrpt    
5   4.092 ± 0.376  ops/us
   StringHelperRandomIdBenchmark.cas_4threads                          thrpt    
5   3.761 ± 0.663  ops/us
   StringHelperRandomIdBenchmark.cas_8threads                          thrpt    
5   2.029 ± 0.830  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_1thread  thrpt    
5   9.744 ± 1.027  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_2thread  thrpt    
5   3.406 ± 1.007  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_4thread  thrpt    
5   5.245 ± 3.767  ops/us
   StringHelperRandomIdBenchmark.lockScopeReducedSynchronized_8thread  thrpt    
5   4.957 ± 0.888  ops/us
   StringHelperRandomIdBenchmark.synchronized_1thread                  thrpt    
5   9.504 ± 0.740  ops/us
   StringHelperRandomIdBenchmark.synchronized_2threads                 thrpt    
5   5.261 ± 1.351  ops/us
   StringHelperRandomIdBenchmark.synchronized_4threads                 thrpt    
5   5.591 ± 0.808  ops/us
   StringHelperRandomIdBenchmark.synchronized_8threads                 thrpt    
5   5.259 ± 0.844  ops/us
   ```
   
   Better visualized, unit is *ops/us*:
   
   On the M3 Pro (Apple Silicon, 12 cores):
   
   | Threads | synchronized | lockScopeReduced | CAS |
   |---------|-------------|-----------------|-----|
   | 1 | 35.3 | 18.8 | 14.9 |
   | 2 | 17.1 | 10.4 | 20.6 |
   | 4 | 16.7 | 9.9 | 11.7 |
   | 8 | 13.4 | 6.2 | 3.9 |
   
   On the EC2 r4.4xlarge (Xeon E5-2686, 16 vCPUs):
   
   | Threads | synchronized | lockScopeReduced | CAS |
   |---------|-------------|-----------------|-----|
   | 1 | 9.5 | 9.7 | 10.1 |
   | 2 | 5.3 | 3.4 | 4.1 |
   | 4 | 5.6 | 5.2 | 3.8 |
   | 8 | 5.3 | 5.0 | 2.0 |
   
   The original synchronized version is actually the fastest single-threaded on 
M3 with 35.3 ops/us, more than 2x the other alternatives. Synchronization on an 
uncontended lock is basically free thanks to the JVM's lightweight (thin) 
locking — note that biased locking was removed in JDK 15, so it plays no role 
on the JDK 25 used here. In multi-threaded scenarios, the original also looks 
more stable.
   
   CAS barely outperforms at 1 thread, then falls off a cliff under higher 
contention: a retry storm develops in which threads keep spinning on failed 
compareAndSet attempts, wasting cycles.
   
   Interestingly, the "lockScopeReduced" variant doesn't actually do that well. 
It moves `toByteArray()` outside the lock, which sounds like a win. But the 
data suggests the lock hold time is already so short that reducing its scope 
doesn't help; the extra local-variable copy may also introduce a bit of 
overhead that offsets the gain.
   
   Anyway, the original synchronized version remains better (or tied for 
fastest) in nearly every scenario across x86_64 and Mac ARM. The two 
"improvements" either make things worse or offer limited gains.
   
   As @rmuir pointed out, `randomId()` is not a hot path in Lucene. To avoid 
over-optimization — and correct me if I am wrong — I would suggest we take 
another look and consider keeping the original, simpler implementation.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to