On Tue, 13 Aug 2024 12:57:23 GMT, Axel Boldt-Christmas <abold...@openjdk.org> wrote:
>> src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp line 674: >> >>> 672: >>> 673: // Search for obj in cache. >>> 674: bind(loop); >> >> Same loop transformation would be possible here. > > I tried the following (see diff below) and it shows about a 5-10% regression > in most the `LockUnlock.testInflated*` micros. Also tried with just > `num_unrolled = 1` saw the same regression. Maybe there was some other > pattern you were thinking of. There are probably architecture and platform > differences. This can and should probably be explored in a followup PR. > > > > diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp > b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp > index 5dbfdbc225d..4e6621cfece 100644 > --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp > +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp > @@ -663,25 +663,28 @@ void C2_MacroAssembler::fast_lock_lightweight(Register > obj, Register box, Regist > > const int num_unrolled = 2; > for (int i = 0; i < num_unrolled; i++) { > - cmpptr(obj, Address(t)); > - jccb(Assembler::equal, monitor_found); > - increment(t, in_bytes(OMCache::oop_to_oop_difference())); > + Label next; > + cmpptr(obj, Address(t, OMCache::oop_to_oop_difference() * i)); > + jccb(Assembler::notEqual, next); > + increment(t, in_bytes(OMCache::oop_to_oop_difference() * i)); > + jmpb(monitor_found); > + bind(next); > } > + increment(t, in_bytes(OMCache::oop_to_oop_difference() * (num_unrolled > - 1))); > > Label loop; > > // Search for obj in cache. > bind(loop); > - > - // Check for match. > - cmpptr(obj, Address(t)); > - jccb(Assembler::equal, monitor_found); > - > + // Advance. > + increment(t, in_bytes(OMCache::oop_to_oop_difference())); > // Search until null encountered, guaranteed _null_sentinel at end. > cmpptr(Address(t), 1); > jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == > 0 > - increment(t, in_bytes(OMCache::oop_to_oop_difference())); > - jmpb(loop); > + > + // Check for match. > + cmpptr(obj, Address(t)); > + jccb(Assembler::notEqual, loop); > > // Cache hit. > bind(monitor_found); Yeah it's probably not very important. But it's not quite what I had in mind, I was thinking more something like (aarch64 version, untested, may be wrong): diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index 19af03d3488..05bbb5760b8 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -302,14 +302,14 @@ void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Regist Label monitor_found; // Load cache address - lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); + lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset() - OMCache::oop_to_oop_difference())); const int num_unrolled = 2; for (int i = 0; i < num_unrolled; i++) { + increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); ldr(t1, Address(t3_t)); cmp(obj, t1); br(Assembler::EQ, monitor_found); - increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); } Label loop; @@ -317,16 +317,14 @@ void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Regist // Search for obj in cache. bind(loop); - // Check for match. - ldr(t1, Address(t3_t)); - cmp(obj, t1); - br(Assembler::EQ, monitor_found); + increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); + ldr(t1, Address(t3_t)); // Search until null encountered, guaranteed _null_sentinel at end. - increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); - cbnz(t1, loop); - // Cache Miss, NE set from cmp above, cbnz does not set flags - b(slow_path); + cbz(t1, slow_path); + // Check for match. + cmp(obj, t1); + br(Assembler::NE, loop); bind(monitor_found); ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/20067#discussion_r1715557161