Hi

After spending some time looking at the assembly code generated by llvmjit for 
a simple query (SELECT * FROM demo WHERE a = 42), digging in the IR showed that 
by simply tweaking the IR one could push llvm into generating better code, kind 
of "for free", without having to spend time in the LLVM optimizer.

I noticed the following patterns:

- in EEOP_QUAL, the test is evaluated; then if the test succeeds one jump is 
taken, otherwise another jump is taken:

```
0x...123 test $0x1, %al
0x...125 jne 0x....129
0x...127 jmp 0x...147
```
This can be fixed by inverting the jump: ```0x...125 je 0x...147```
One less jump "for free".

- in tuple deforming, the end blocks are generated before the attribute 
blocks. At O0 their assembly code ends up before the attribute code, and thus 
there are jumps around these blocks for no reason. This unnatural code layout 
can be prevented by simply creating the end blocks after the attribute blocks.
- in tuple deforming, some blocks are created even when empty, and instead of 
jumping to the next non-empty block we jump to the next block. This creates the 
weird jump pattern I mentioned in my previous patch about always enabling the 
simplifycfg pass. By reworking the attribute blocks, it's possible to 
remove most jumps without having to rely on the optimizer (note that I do not 
withdraw the suggestion of adding the simplifycfg pass; it can do more than I 
do in this patch).

This may make the C code a bit harder to read, but the end result is quite 
positive.
On my Zen2 desktop system, running a very basic query with JIT enabled and no 
optimization (O0), I get the following:
no-patch:
AVG: 31.0667 run;1.2143000000000002 jit
MIN: 31.003 run;1.191 jit
MAX: 31.096 run;1.235 jit
STDEV: 0.030408514889382194;0.01458347618977652
patch:
AVG: 26.159 run;1.1922 jit
MIN: 26.069 run;1.165 jit
MAX: 26.235 run;1.215 jit
STDEV: 0.05207473262320014;0.01559772063119765

As you can see, this gives an interesting boost in performance for no CPU cost.
From 1c1da11022c64ebb8bd1bec9bdca9783a78c94ee Mon Sep 17 00:00:00 2001
From: Pierre Ducroquet <[email protected]>
Date: Fri, 30 Jan 2026 10:35:43 +0100
Subject: [PATCH] llvmjit: reduce the number of jumps generated in O0

When using O0, LLVM doesn't try to change the basic blocks order in
order to have a linear code in memory. LLVM also doesn't remove
jumps even when they target the instruction next to the current one.

Adding an optimizer step in O0 could end up having bad side effects,
so instead of asking LLVM to fix it, we can modify the IR code we
generate in order to get rid of as many jumps as possible.

- EEOP_QUAL was written following the C logic, thus:
	if null or value is false:
		jump to qualfail
	jump to next block
	qualfail:
	....

  By inverting the if, we have instead:
        if !null and value is not false:
		jump to next block
	....

  This is one less jump on amd64 with O0

- change the block creation order in tuple_deforming so that the
  outblock stays at the end of the function, removing a jump back from
  the last attribute to the outblock before
- don't create the adjust_unavail_cols block if not needed
- jump directly above the attisnull and the attcheckalign blocks if
  they are empty

All these together remove 7 jumps on a very basic query, and makes the
generated assembly code far more natural and easier for the CPU.
---
 src/backend/jit/llvm/llvmjit_deform.c | 62 +++++++++++++++++++++------
 src/backend/jit/llvm/llvmjit_expr.c   | 22 +++++-----
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/src/backend/jit/llvm/llvmjit_deform.c b/src/backend/jit/llvm/llvmjit_deform.c
index 3eb087eb56b..9aeff1e4ff5 100644
--- a/src/backend/jit/llvm/llvmjit_deform.c
+++ b/src/backend/jit/llvm/llvmjit_deform.c
@@ -145,14 +145,8 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 
 	b_entry =
 		LLVMAppendBasicBlockInContext(lc, v_deform_fn, "entry");
-	b_adjust_unavail_cols =
-		LLVMAppendBasicBlockInContext(lc, v_deform_fn, "adjust_unavail_cols");
 	b_find_start =
 		LLVMAppendBasicBlockInContext(lc, v_deform_fn, "find_startblock");
-	b_out =
-		LLVMAppendBasicBlockInContext(lc, v_deform_fn, "outblock");
-	b_dead =
-		LLVMAppendBasicBlockInContext(lc, v_deform_fn, "deadblock");
 
 	b = LLVMCreateBuilderInContext(lc);
 
@@ -314,6 +308,10 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 			l_bb_append_v(v_deform_fn, "block.attr.%d.store", attnum);
 	}
 
+	/* create the exit and dead blocks at the end, so that even with O0 they will be at the end */
+	b_out = LLVMAppendBasicBlockInContext(lc, v_deform_fn, "outblock");
+	b_dead = LLVMAppendBasicBlockInContext(lc, v_deform_fn, "deadblock");
+
 	/*
 	 * Check if it is guaranteed that all the desired attributes are available
 	 * in the tuple (but still possibly NULL), by dint of either the last
@@ -325,8 +323,6 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 	if ((natts - 1) <= guaranteed_column_number)
 	{
 		/* just skip through unnecessary blocks */
-		LLVMBuildBr(b, b_adjust_unavail_cols);
-		LLVMPositionBuilderAtEnd(b, b_adjust_unavail_cols);
 		LLVMBuildBr(b, b_find_start);
 	}
 	else
@@ -334,6 +330,9 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 		LLVMValueRef v_params[3];
 		LLVMValueRef f;
 
+		/* create the block since it is now needed */
+		b_adjust_unavail_cols = LLVMAppendBasicBlockInContext(lc, v_deform_fn, "adjust_unavail_cols");
+
 		/* branch if not all columns available */
 		LLVMBuildCondBr(b,
 						LLVMBuildICmp(b, LLVMIntULT,
@@ -399,6 +398,8 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 		LLVMValueRef l_attno = l_int16_const(lc, attnum);
 		LLVMValueRef v_attdatap;
 		LLVMValueRef v_resultp;
+		bool		delayed_jump_in_nonnullable;
+		bool		delayed_jump_in_attcheckno;
 
 		/* build block checking whether we did all the necessary attributes */
 		LLVMPositionBuilderAtEnd(b, attcheckattnoblocks[attnum]);
@@ -419,7 +420,7 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 		 */
 		if (attnum <= guaranteed_column_number)
 		{
-			LLVMBuildBr(b, attstartblocks[attnum]);
+			delayed_jump_in_attcheckno = true;
 		}
 		else
 		{
@@ -430,6 +431,7 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 									 v_maxatt,
 									 "heap_natts");
 			LLVMBuildCondBr(b, v_islast, b_out, attstartblocks[attnum]);
+			delayed_jump_in_attcheckno = false;
 		}
 		LLVMPositionBuilderAtEnd(b, attstartblocks[attnum]);
 
@@ -484,13 +486,19 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 
 			LLVMBuildBr(b, b_next);
 			attguaranteedalign = false;
+			delayed_jump_in_nonnullable = false;
+			/* add the jump to our attisnull block in start */
+			if (delayed_jump_in_attcheckno)
+			{
+				LLVMPositionBuilderAtEnd(b, attcheckattnoblocks[attnum]);
+				LLVMBuildBr(b, attstartblocks[attnum]);
+				delayed_jump_in_attcheckno = false;
+			}
 		}
 		else
 		{
 			/* nothing to do */
-			LLVMBuildBr(b, attcheckalignblocks[attnum]);
-			LLVMPositionBuilderAtEnd(b, attisnullblocks[attnum]);
-			LLVMBuildBr(b, attcheckalignblocks[attnum]);
+			delayed_jump_in_nonnullable = true;
 		}
 		LLVMPositionBuilderAtEnd(b, attcheckalignblocks[attnum]);
 
@@ -574,14 +582,40 @@ slot_compile_deform(LLVMJitContext *context, TupleDesc desc,
 			}
 
 			LLVMBuildBr(b, attstoreblocks[attnum]);
-			LLVMPositionBuilderAtEnd(b, attstoreblocks[attnum]);
+			if (delayed_jump_in_nonnullable)
+			{
+				LLVMPositionBuilderAtEnd(b, attstartblocks[attnum]);
+				LLVMBuildBr(b, attcheckalignblocks[attnum]);
+				LLVMPositionBuilderAtEnd(b, attisnullblocks[attnum]);
+				LLVMBuildBr(b, attcheckalignblocks[attnum]);
+			}
+			if (delayed_jump_in_attcheckno)
+			{
+				LLVMPositionBuilderAtEnd(b, attcheckattnoblocks[attnum]);
+				LLVMBuildBr(b, attcheckalignblocks[attnum]);
+				delayed_jump_in_attcheckno = false;
+			}
 		}
 		else
 		{
 			LLVMPositionBuilderAtEnd(b, attcheckalignblocks[attnum]);
-			LLVMBuildBr(b, attalignblocks[attnum]);
+			LLVMBuildBr(b, attstoreblocks[attnum]);
 			LLVMPositionBuilderAtEnd(b, attalignblocks[attnum]);
 			LLVMBuildBr(b, attstoreblocks[attnum]);
+			if (delayed_jump_in_nonnullable)
+			{
+				LLVMPositionBuilderAtEnd(b, attstartblocks[attnum]);
+				LLVMBuildBr(b, attstoreblocks[attnum]);
+				LLVMPositionBuilderAtEnd(b, attisnullblocks[attnum]);
+				LLVMBuildBr(b, attstoreblocks[attnum]);
+			}
+
+			if (delayed_jump_in_attcheckno)
+			{
+				LLVMPositionBuilderAtEnd(b, attcheckattnoblocks[attnum]);
+				LLVMBuildBr(b, attstoreblocks[attnum]);
+				delayed_jump_in_attcheckno = false;
+			}
 		}
 		LLVMPositionBuilderAtEnd(b, attstoreblocks[attnum]);
 
diff --git a/src/backend/jit/llvm/llvmjit_expr.c b/src/backend/jit/llvm/llvmjit_expr.c
index 885b34c27e4..c942e6f4557 100644
--- a/src/backend/jit/llvm/llvmjit_expr.c
+++ b/src/backend/jit/llvm/llvmjit_expr.c
@@ -1224,7 +1224,7 @@ llvm_compile_expr(ExprState *state)
 				{
 					LLVMValueRef v_resnull;
 					LLVMValueRef v_resvalue;
-					LLVMValueRef v_nullorfalse;
+					LLVMValueRef v_notnullnorfalse;
 					LLVMBasicBlockRef b_qualfail;
 
 					b_qualfail = l_bb_before_v(opblocks[opno + 1],
@@ -1233,18 +1233,18 @@ llvm_compile_expr(ExprState *state)
 					v_resvalue = l_load(b, TypeDatum, v_resvaluep, "");
 					v_resnull = l_load(b, TypeStorageBool, v_resnullp, "");
 
-					v_nullorfalse =
-						LLVMBuildOr(b,
-									LLVMBuildICmp(b, LLVMIntEQ, v_resnull,
-												  l_sbool_const(1), ""),
-									LLVMBuildICmp(b, LLVMIntEQ, v_resvalue,
-												  l_datum_const(0), ""),
-									"");
+					v_notnullnorfalse =
+						LLVMBuildAnd(b,
+									 LLVMBuildICmp(b, LLVMIntNE, v_resnull,
+												   l_sbool_const(1), ""),
+									 LLVMBuildICmp(b, LLVMIntNE, v_resvalue,
+												   l_datum_const(0), ""),
+									 "");
 
 					LLVMBuildCondBr(b,
-									v_nullorfalse,
-									b_qualfail,
-									opblocks[opno + 1]);
+									v_notnullnorfalse,
+									opblocks[opno + 1],
+									b_qualfail);
 
 					/* build block handling NULL or false */
 					LLVMPositionBuilderAtEnd(b, b_qualfail);
-- 
2.43.0

Reply via email to