On Wed, 9 Jul 2025 at 22:31, Dean Rasheed <dean.a.rash...@gmail.com> wrote:
>
> On Wed, 9 Jul 2025 at 18:27, Andres Freund <and...@anarazel.de> wrote:
> >
> > I think we should wire this up to the buildsystem and our testsuite...
> > Having testcode that is not run automatically may be helpful while
> > originally developing something, but it doesn't do anything to detect
> > portability issues or regressions.
>
> Yes, perhaps we should convert src/tools/testint128.c into a new test
> extension, src/test/modules/test_int128

Here's an update doing that (in 0001). 0002-0005 are unchanged.

Regards,
Dean
From 34f72f257ff9cd521175c7169022db0435d990a4 Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Thu, 10 Jul 2025 14:45:10 +0100
Subject: [PATCH v2 5/5] Extend int128.h to support more numeric code.

This adds a few more functions to int128.h, allowing more of numeric.c
to use 128-bit integers on all platforms.

Specifically, int64_div_fast_to_numeric() and the following aggregate
functions can now use 128-bit integers for improved performance on all
platforms, rather than just platforms with native support for int128:

- SUM(int8)
- AVG(int8)
- STDDEV_POP(int2 or int4)
- STDDEV_SAMP(int2 or int4)
- VAR_POP(int2 or int4)
- VAR_SAMP(int2 or int4)

In addition to improved performance on platforms lacking native
128-bit integer support, this significantly simplifies this numeric
code by allowing a lot of conditionally compiled code to be deleted.

A couple of numeric functions (div_var_int64() and sqrt_var()) still
contain conditionally compiled 128-bit integer code that only works on
platforms with native 128-bit integer support. Making those work
portably would require implementing our own higher-precision 128-bit
division, which this patch does not attempt.
---
 src/backend/utils/adt/numeric.c            | 502 +++++----------------
 src/include/common/int128.h                | 239 ++++++++++
 src/test/modules/test_int128/test_int128.c | 103 ++++-
 3 files changed, 460 insertions(+), 384 deletions(-)

diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index c9233565d57..1f1eb57d832 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -28,6 +28,7 @@
 
 #include "common/hashfn.h"
 #include "common/int.h"
+#include "common/int128.h"
 #include "funcapi.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
@@ -534,10 +535,7 @@ static bool numericvar_to_int32(const NumericVar *var, int32 *result);
 static bool numericvar_to_int64(const NumericVar *var, int64 *result);
 static void int64_to_numericvar(int64 val, NumericVar *var);
 static bool numericvar_to_uint64(const NumericVar *var, uint64 *result);
-#ifdef HAVE_INT128
-static bool numericvar_to_int128(const NumericVar *var, int128 *result);
-static void int128_to_numericvar(int128 val, NumericVar *var);
-#endif
+static void int128_to_numericvar(INT128 val, NumericVar *var);
 static double numericvar_to_double_no_overflow(const NumericVar *var);
 
 static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup);
@@ -4463,25 +4461,13 @@ int64_div_fast_to_numeric(int64 val1, int log10val2)
 
 		if (unlikely(pg_mul_s64_overflow(val1, factor, &new_val1)))
 		{
-#ifdef HAVE_INT128
 			/* do the multiplication using 128-bit integers */
-			int128		tmp;
+			INT128		tmp;
 
-			tmp = (int128) val1 * (int128) factor;
+			tmp = int64_to_int128(0);
+			int128_add_int64_mul_int64(&tmp, val1, factor);
 
 			int128_to_numericvar(tmp, &result);
-#else
-			/* do the multiplication using numerics */
-			NumericVar	tmp;
-
-			init_var(&tmp);
-
-			int64_to_numericvar(val1, &result);
-			int64_to_numericvar(factor, &tmp);
-			mul_var(&result, &tmp, &result, 0);
-
-			free_var(&tmp);
-#endif
 		}
 		else
 			int64_to_numericvar(new_val1, &result);
@@ -4901,8 +4887,8 @@ numeric_pg_lsn(PG_FUNCTION_ARGS)
  * Actually, it's a pointer to a NumericAggState allocated in the aggregate
  * context.  The digit buffers for the NumericVars will be there too.
  *
- * On platforms which support 128-bit integers some aggregates instead use a
- * 128-bit integer based transition datatype to speed up calculations.
+ * For integer inputs, some aggregates use special-purpose 64-bit or 128-bit
+ * integer based transition datatypes to speed up calculations.
  *
  * ----------------------------------------------------------------------
  */
@@ -5566,26 +5552,27 @@ numeric_accum_inv(PG_FUNCTION_ARGS)
 
 
 /*
- * Integer data types in general use Numeric accumulators to share code
- * and avoid risk of overflow.
+ * Integer data types in general use Numeric accumulators to share code and
+ * avoid risk of overflow.  However for performance reasons optimized
+ * special-purpose accumulator routines are used when possible:
  *
- * However for performance reasons optimized special-purpose accumulator
- * routines are used when possible.
+ * For 16-bit and 32-bit inputs, N and sum(X) fit into 64-bit, so 64-bit
+ * accumulators are used for SUM and AVG of these data types.
  *
- * On platforms with 128-bit integer support, the 128-bit routines will be
- * used when sum(X) or sum(X*X) fit into 128-bit.
+ * For 16-bit and 32-bit inputs, sum(X^2) fits into 128-bit, so 128-bit
+ * accumulators are used for STDDEV_POP, STDDEV_SAMP, VAR_POP, and VAR_SAMP of
+ * these data types.
  *
- * For 16 and 32 bit inputs, the N and sum(X) fit into 64-bit so the 64-bit
- * accumulators will be used for SUM and AVG of these data types.
+ * For 64-bit inputs, sum(X) fits into 128-bit, so a 128-bit accumulator is
+ * used for SUM(int8) and AVG(int8).
  */
 
-#ifdef HAVE_INT128
 typedef struct Int128AggState
 {
 	bool		calcSumX2;		/* if true, calculate sumX2 */
 	int64		N;				/* count of processed numbers */
-	int128		sumX;			/* sum of processed numbers */
-	int128		sumX2;			/* sum of squares of processed numbers */
+	INT128		sumX;			/* sum of processed numbers */
+	INT128		sumX2;			/* sum of squares of processed numbers */
 } Int128AggState;
 
 /*
@@ -5631,12 +5618,12 @@ makeInt128AggStateCurrentContext(bool calcSumX2)
  * Accumulate a new input value for 128-bit aggregate functions.
  */
 static void
-do_int128_accum(Int128AggState *state, int128 newval)
+do_int128_accum(Int128AggState *state, int64 newval)
 {
 	if (state->calcSumX2)
-		state->sumX2 += newval * newval;
+		int128_add_int64_mul_int64(&state->sumX2, newval, newval);
 
-	state->sumX += newval;
+	int128_add_int64(&state->sumX, newval);
 	state->N++;
 }
 
@@ -5644,43 +5631,28 @@ do_int128_accum(Int128AggState *state, int128 newval)
  * Remove an input value from the aggregated state.
  */
 static void
-do_int128_discard(Int128AggState *state, int128 newval)
+do_int128_discard(Int128AggState *state, int64 newval)
 {
 	if (state->calcSumX2)
-		state->sumX2 -= newval * newval;
+		int128_sub_int64_mul_int64(&state->sumX2, newval, newval);
 
-	state->sumX -= newval;
+	int128_sub_int64(&state->sumX, newval);
 	state->N--;
 }
 
-typedef Int128AggState PolyNumAggState;
-#define makePolyNumAggState makeInt128AggState
-#define makePolyNumAggStateCurrentContext makeInt128AggStateCurrentContext
-#else
-typedef NumericAggState PolyNumAggState;
-#define makePolyNumAggState makeNumericAggState
-#define makePolyNumAggStateCurrentContext makeNumericAggStateCurrentContext
-#endif
-
 Datum
 int2_accum(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Create the state data on the first call */
 	if (state == NULL)
-		state = makePolyNumAggState(fcinfo, true);
+		state = makeInt128AggState(fcinfo, true);
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_accum(state, (int128) PG_GETARG_INT16(1));
-#else
-		do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT16(1)));
-#endif
-	}
+		do_int128_accum(state, PG_GETARG_INT16(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -5688,22 +5660,16 @@ int2_accum(PG_FUNCTION_ARGS)
 Datum
 int4_accum(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Create the state data on the first call */
 	if (state == NULL)
-		state = makePolyNumAggState(fcinfo, true);
+		state = makeInt128AggState(fcinfo, true);
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_accum(state, (int128) PG_GETARG_INT32(1));
-#else
-		do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT32(1)));
-#endif
-	}
+		do_int128_accum(state, PG_GETARG_INT32(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -5726,21 +5692,21 @@ int8_accum(PG_FUNCTION_ARGS)
 }
 
 /*
- * Combine function for numeric aggregates which require sumX2
+ * Combine function for Int128AggState for aggregates which require sumX2
  */
 Datum
 numeric_poly_combine(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state1;
-	PolyNumAggState *state2;
+	Int128AggState *state1;
+	Int128AggState *state2;
 	MemoryContext agg_context;
 	MemoryContext old_context;
 
 	if (!AggCheckCallContext(fcinfo, &agg_context))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state1 = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
-	state2 = PG_ARGISNULL(1) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(1);
+	state1 = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
+	state2 = PG_ARGISNULL(1) ? NULL : (Int128AggState *) PG_GETARG_POINTER(1);
 
 	if (state2 == NULL)
 		PG_RETURN_POINTER(state1);
@@ -5750,16 +5716,10 @@ numeric_poly_combine(PG_FUNCTION_ARGS)
 	{
 		old_context = MemoryContextSwitchTo(agg_context);
 
-		state1 = makePolyNumAggState(fcinfo, true);
+		state1 = makeInt128AggState(fcinfo, true);
 		state1->N = state2->N;
-
-#ifdef HAVE_INT128
 		state1->sumX = state2->sumX;
 		state1->sumX2 = state2->sumX2;
-#else
-		accum_sum_copy(&state1->sumX, &state2->sumX);
-		accum_sum_copy(&state1->sumX2, &state2->sumX2);
-#endif
 
 		MemoryContextSwitchTo(old_context);
 
@@ -5769,54 +5729,51 @@ numeric_poly_combine(PG_FUNCTION_ARGS)
 	if (state2->N > 0)
 	{
 		state1->N += state2->N;
+		int128_add_int128(&state1->sumX, state2->sumX);
+		int128_add_int128(&state1->sumX2, state2->sumX2);
+	}
+	PG_RETURN_POINTER(state1);
+}
 
-#ifdef HAVE_INT128
-		state1->sumX += state2->sumX;
-		state1->sumX2 += state2->sumX2;
-#else
-		/* The rest of this needs to work in the aggregate context */
-		old_context = MemoryContextSwitchTo(agg_context);
-
-		/* Accumulate sums */
-		accum_sum_combine(&state1->sumX, &state2->sumX);
-		accum_sum_combine(&state1->sumX2, &state2->sumX2);
+/*
+ * int128_serialize - serialize a 128-bit integer to binary format
+ */
+static inline void
+int128_serialize(StringInfo buf, INT128 val)
+{
+	pq_sendint64(buf, PG_INT128_HI_INT64(val));
+	pq_sendint64(buf, PG_INT128_LO_UINT64(val));
+}
 
-		MemoryContextSwitchTo(old_context);
-#endif
+/*
+ * int128_deserialize - deserialize binary format to a 128-bit integer.
+ */
+static inline INT128
+int128_deserialize(StringInfo buf)
+{
+	int64		hi = pq_getmsgint64(buf);
+	uint64		lo = pq_getmsgint64(buf);
 
-	}
-	PG_RETURN_POINTER(state1);
+	return make_int128(hi, lo);
 }
 
 /*
  * numeric_poly_serialize
- *		Serialize PolyNumAggState into bytea for aggregate functions which
+ *		Serialize Int128AggState into bytea for aggregate functions which
  *		require sumX2.
  */
 Datum
 numeric_poly_serialize(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 	StringInfoData buf;
 	bytea	   *result;
-	NumericVar	tmp_var;
 
 	/* Ensure we disallow calling when not in aggregate context */
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state = (PolyNumAggState *) PG_GETARG_POINTER(0);
-
-	/*
-	 * If the platform supports int128 then sumX and sumX2 will be a 128 bit
-	 * integer type. Here we'll convert that into a numeric type so that the
-	 * combine state is in the same format for both int128 enabled machines
-	 * and machines which don't support that type. The logic here is that one
-	 * day we might like to send these over to another server for further
-	 * processing and we want a standard format to work with.
-	 */
-
-	init_var(&tmp_var);
+	state = (Int128AggState *) PG_GETARG_POINTER(0);
 
 	pq_begintypsend(&buf);
 
@@ -5824,48 +5781,33 @@ numeric_poly_serialize(PG_FUNCTION_ARGS)
 	pq_sendint64(&buf, state->N);
 
 	/* sumX */
-#ifdef HAVE_INT128
-	int128_to_numericvar(state->sumX, &tmp_var);
-#else
-	accum_sum_final(&state->sumX, &tmp_var);
-#endif
-	numericvar_serialize(&buf, &tmp_var);
+	int128_serialize(&buf, state->sumX);
 
 	/* sumX2 */
-#ifdef HAVE_INT128
-	int128_to_numericvar(state->sumX2, &tmp_var);
-#else
-	accum_sum_final(&state->sumX2, &tmp_var);
-#endif
-	numericvar_serialize(&buf, &tmp_var);
+	int128_serialize(&buf, state->sumX2);
 
 	result = pq_endtypsend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_BYTEA_P(result);
 }
 
 /*
  * numeric_poly_deserialize
- *		Deserialize PolyNumAggState from bytea for aggregate functions which
+ *		Deserialize Int128AggState from bytea for aggregate functions which
  *		require sumX2.
  */
 Datum
 numeric_poly_deserialize(PG_FUNCTION_ARGS)
 {
 	bytea	   *sstate;
-	PolyNumAggState *result;
+	Int128AggState *result;
 	StringInfoData buf;
-	NumericVar	tmp_var;
 
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
 	sstate = PG_GETARG_BYTEA_PP(0);
 
-	init_var(&tmp_var);
-
 	/*
 	 * Initialize a StringInfo so that we can "receive" it using the standard
 	 * recv-function infrastructure.
@@ -5873,31 +5815,19 @@ numeric_poly_deserialize(PG_FUNCTION_ARGS)
 	initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
 						   VARSIZE_ANY_EXHDR(sstate));
 
-	result = makePolyNumAggStateCurrentContext(false);
+	result = makeInt128AggStateCurrentContext(false);
 
 	/* N */
 	result->N = pq_getmsgint64(&buf);
 
 	/* sumX */
-	numericvar_deserialize(&buf, &tmp_var);
-#ifdef HAVE_INT128
-	numericvar_to_int128(&tmp_var, &result->sumX);
-#else
-	accum_sum_add(&result->sumX, &tmp_var);
-#endif
+	result->sumX = int128_deserialize(&buf);
 
 	/* sumX2 */
-	numericvar_deserialize(&buf, &tmp_var);
-#ifdef HAVE_INT128
-	numericvar_to_int128(&tmp_var, &result->sumX2);
-#else
-	accum_sum_add(&result->sumX2, &tmp_var);
-#endif
+	result->sumX2 = int128_deserialize(&buf);
 
 	pq_getmsgend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_POINTER(result);
 }
 
@@ -5907,43 +5837,37 @@ numeric_poly_deserialize(PG_FUNCTION_ARGS)
 Datum
 int8_avg_accum(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Create the state data on the first call */
 	if (state == NULL)
-		state = makePolyNumAggState(fcinfo, false);
+		state = makeInt128AggState(fcinfo, false);
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_accum(state, (int128) PG_GETARG_INT64(1));
-#else
-		do_numeric_accum(state, int64_to_numeric(PG_GETARG_INT64(1)));
-#endif
-	}
+		do_int128_accum(state, PG_GETARG_INT64(1));
 
 	PG_RETURN_POINTER(state);
 }
 
 /*
- * Combine function for PolyNumAggState for aggregates which don't require
+ * Combine function for Int128AggState for aggregates which don't require
  * sumX2
  */
 Datum
 int8_avg_combine(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state1;
-	PolyNumAggState *state2;
+	Int128AggState *state1;
+	Int128AggState *state2;
 	MemoryContext agg_context;
 	MemoryContext old_context;
 
 	if (!AggCheckCallContext(fcinfo, &agg_context))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state1 = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
-	state2 = PG_ARGISNULL(1) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(1);
+	state1 = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
+	state2 = PG_ARGISNULL(1) ? NULL : (Int128AggState *) PG_GETARG_POINTER(1);
 
 	if (state2 == NULL)
 		PG_RETURN_POINTER(state1);
@@ -5953,14 +5877,10 @@ int8_avg_combine(PG_FUNCTION_ARGS)
 	{
 		old_context = MemoryContextSwitchTo(agg_context);
 
-		state1 = makePolyNumAggState(fcinfo, false);
+		state1 = makeInt128AggState(fcinfo, false);
 		state1->N = state2->N;
-
-#ifdef HAVE_INT128
 		state1->sumX = state2->sumX;
-#else
-		accum_sum_copy(&state1->sumX, &state2->sumX);
-#endif
+
 		MemoryContextSwitchTo(old_context);
 
 		PG_RETURN_POINTER(state1);
@@ -5969,52 +5889,28 @@ int8_avg_combine(PG_FUNCTION_ARGS)
 	if (state2->N > 0)
 	{
 		state1->N += state2->N;
-
-#ifdef HAVE_INT128
-		state1->sumX += state2->sumX;
-#else
-		/* The rest of this needs to work in the aggregate context */
-		old_context = MemoryContextSwitchTo(agg_context);
-
-		/* Accumulate sums */
-		accum_sum_combine(&state1->sumX, &state2->sumX);
-
-		MemoryContextSwitchTo(old_context);
-#endif
-
+		int128_add_int128(&state1->sumX, state2->sumX);
 	}
 	PG_RETURN_POINTER(state1);
 }
 
 /*
  * int8_avg_serialize
- *		Serialize PolyNumAggState into bytea using the standard
- *		recv-function infrastructure.
+ *		Serialize Int128AggState into bytea for aggregate functions which
+ *		don't require sumX2.
  */
 Datum
 int8_avg_serialize(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 	StringInfoData buf;
 	bytea	   *result;
-	NumericVar	tmp_var;
 
 	/* Ensure we disallow calling when not in aggregate context */
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
-	state = (PolyNumAggState *) PG_GETARG_POINTER(0);
-
-	/*
-	 * If the platform supports int128 then sumX will be a 128 integer type.
-	 * Here we'll convert that into a numeric type so that the combine state
-	 * is in the same format for both int128 enabled machines and machines
-	 * which don't support that type. The logic here is that one day we might
-	 * like to send these over to another server for further processing and we
-	 * want a standard format to work with.
-	 */
-
-	init_var(&tmp_var);
+	state = (Int128AggState *) PG_GETARG_POINTER(0);
 
 	pq_begintypsend(&buf);
 
@@ -6022,39 +5918,30 @@ int8_avg_serialize(PG_FUNCTION_ARGS)
 	pq_sendint64(&buf, state->N);
 
 	/* sumX */
-#ifdef HAVE_INT128
-	int128_to_numericvar(state->sumX, &tmp_var);
-#else
-	accum_sum_final(&state->sumX, &tmp_var);
-#endif
-	numericvar_serialize(&buf, &tmp_var);
+	int128_serialize(&buf, state->sumX);
 
 	result = pq_endtypsend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_BYTEA_P(result);
 }
 
 /*
  * int8_avg_deserialize
- *		Deserialize bytea back into PolyNumAggState.
+ *		Deserialize Int128AggState from bytea for aggregate functions which
+ *		don't require sumX2.
  */
 Datum
 int8_avg_deserialize(PG_FUNCTION_ARGS)
 {
 	bytea	   *sstate;
-	PolyNumAggState *result;
+	Int128AggState *result;
 	StringInfoData buf;
-	NumericVar	tmp_var;
 
 	if (!AggCheckCallContext(fcinfo, NULL))
 		elog(ERROR, "aggregate function called in non-aggregate context");
 
 	sstate = PG_GETARG_BYTEA_PP(0);
 
-	init_var(&tmp_var);
-
 	/*
 	 * Initialize a StringInfo so that we can "receive" it using the standard
 	 * recv-function infrastructure.
@@ -6062,23 +5949,16 @@ int8_avg_deserialize(PG_FUNCTION_ARGS)
 	initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
 						   VARSIZE_ANY_EXHDR(sstate));
 
-	result = makePolyNumAggStateCurrentContext(false);
+	result = makeInt128AggStateCurrentContext(false);
 
 	/* N */
 	result->N = pq_getmsgint64(&buf);
 
 	/* sumX */
-	numericvar_deserialize(&buf, &tmp_var);
-#ifdef HAVE_INT128
-	numericvar_to_int128(&tmp_var, &result->sumX);
-#else
-	accum_sum_add(&result->sumX, &tmp_var);
-#endif
+	result->sumX = int128_deserialize(&buf);
 
 	pq_getmsgend(&buf);
 
-	free_var(&tmp_var);
-
 	PG_RETURN_POINTER(result);
 }
 
@@ -6089,24 +5969,16 @@ int8_avg_deserialize(PG_FUNCTION_ARGS)
 Datum
 int2_accum_inv(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Should not get here with no state */
 	if (state == NULL)
 		elog(ERROR, "int2_accum_inv called with NULL state");
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_discard(state, (int128) PG_GETARG_INT16(1));
-#else
-		/* Should never fail, all inputs have dscale 0 */
-		if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT16(1))))
-			elog(ERROR, "do_numeric_discard failed unexpectedly");
-#endif
-	}
+		do_int128_discard(state, PG_GETARG_INT16(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -6114,24 +5986,16 @@ int2_accum_inv(PG_FUNCTION_ARGS)
 Datum
 int4_accum_inv(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Should not get here with no state */
 	if (state == NULL)
 		elog(ERROR, "int4_accum_inv called with NULL state");
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_discard(state, (int128) PG_GETARG_INT32(1));
-#else
-		/* Should never fail, all inputs have dscale 0 */
-		if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT32(1))))
-			elog(ERROR, "do_numeric_discard failed unexpectedly");
-#endif
-	}
+		do_int128_discard(state, PG_GETARG_INT32(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -6160,24 +6024,16 @@ int8_accum_inv(PG_FUNCTION_ARGS)
 Datum
 int8_avg_accum_inv(PG_FUNCTION_ARGS)
 {
-	PolyNumAggState *state;
+	Int128AggState *state;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* Should not get here with no state */
 	if (state == NULL)
 		elog(ERROR, "int8_avg_accum_inv called with NULL state");
 
 	if (!PG_ARGISNULL(1))
-	{
-#ifdef HAVE_INT128
-		do_int128_discard(state, (int128) PG_GETARG_INT64(1));
-#else
-		/* Should never fail, all inputs have dscale 0 */
-		if (!do_numeric_discard(state, int64_to_numeric(PG_GETARG_INT64(1))))
-			elog(ERROR, "do_numeric_discard failed unexpectedly");
-#endif
-	}
+		do_int128_discard(state, PG_GETARG_INT64(1));
 
 	PG_RETURN_POINTER(state);
 }
@@ -6185,12 +6041,11 @@ int8_avg_accum_inv(PG_FUNCTION_ARGS)
 Datum
 numeric_poly_sum(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	NumericVar	result;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* If there were no non-null inputs, return NULL */
 	if (state == NULL || state->N == 0)
@@ -6205,21 +6060,17 @@ numeric_poly_sum(PG_FUNCTION_ARGS)
 	free_var(&result);
 
 	PG_RETURN_NUMERIC(res);
-#else
-	return numeric_sum(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_avg(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	NumericVar	result;
 	Datum		countd,
 				sumd;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	/* If there were no non-null inputs, return NULL */
 	if (state == NULL || state->N == 0)
@@ -6235,9 +6086,6 @@ numeric_poly_avg(PG_FUNCTION_ARGS)
 	free_var(&result);
 
 	PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sumd, countd));
-#else
-	return numeric_avg(fcinfo);
-#endif
 }
 
 Datum
@@ -6470,7 +6318,6 @@ numeric_stddev_pop(PG_FUNCTION_ARGS)
 		PG_RETURN_NUMERIC(res);
 }
 
-#ifdef HAVE_INT128
 static Numeric
 numeric_poly_stddev_internal(Int128AggState *state,
 							 bool variance, bool sample,
@@ -6514,17 +6361,15 @@ numeric_poly_stddev_internal(Int128AggState *state,
 
 	return res;
 }
-#endif
 
 Datum
 numeric_poly_var_samp(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, true, true, &is_null);
 
@@ -6532,20 +6377,16 @@ numeric_poly_var_samp(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_var_samp(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_stddev_samp(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, false, true, &is_null);
 
@@ -6553,20 +6394,16 @@ numeric_poly_stddev_samp(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_stddev_samp(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_var_pop(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, true, false, &is_null);
 
@@ -6574,20 +6411,16 @@ numeric_poly_var_pop(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_var_pop(fcinfo);
-#endif
 }
 
 Datum
 numeric_poly_stddev_pop(PG_FUNCTION_ARGS)
 {
-#ifdef HAVE_INT128
-	PolyNumAggState *state;
+	Int128AggState *state;
 	Numeric		res;
 	bool		is_null;
 
-	state = PG_ARGISNULL(0) ? NULL : (PolyNumAggState *) PG_GETARG_POINTER(0);
+	state = PG_ARGISNULL(0) ? NULL : (Int128AggState *) PG_GETARG_POINTER(0);
 
 	res = numeric_poly_stddev_internal(state, false, false, &is_null);
 
@@ -6595,9 +6428,6 @@ numeric_poly_stddev_pop(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 	else
 		PG_RETURN_NUMERIC(res);
-#else
-	return numeric_stddev_pop(fcinfo);
-#endif
 }
 
 /*
@@ -8330,105 +8160,23 @@ numericvar_to_uint64(const NumericVar *var, uint64 *result)
 	return true;
 }
 
-#ifdef HAVE_INT128
-/*
- * Convert numeric to int128, rounding if needed.
- *
- * If overflow, return false (no error is raised).  Return true if okay.
- */
-static bool
-numericvar_to_int128(const NumericVar *var, int128 *result)
-{
-	NumericDigit *digits;
-	int			ndigits;
-	int			weight;
-	int			i;
-	int128		val,
-				oldval;
-	bool		neg;
-	NumericVar	rounded;
-
-	/* Round to nearest integer */
-	init_var(&rounded);
-	set_var_from_var(var, &rounded);
-	round_var(&rounded, 0);
-
-	/* Check for zero input */
-	strip_var(&rounded);
-	ndigits = rounded.ndigits;
-	if (ndigits == 0)
-	{
-		*result = 0;
-		free_var(&rounded);
-		return true;
-	}
-
-	/*
-	 * For input like 10000000000, we must treat stripped digits as real. So
-	 * the loop assumes there are weight+1 digits before the decimal point.
-	 */
-	weight = rounded.weight;
-	Assert(weight >= 0 && ndigits <= weight + 1);
-
-	/* Construct the result */
-	digits = rounded.digits;
-	neg = (rounded.sign == NUMERIC_NEG);
-	val = digits[0];
-	for (i = 1; i <= weight; i++)
-	{
-		oldval = val;
-		val *= NBASE;
-		if (i < ndigits)
-			val += digits[i];
-
-		/*
-		 * The overflow check is a bit tricky because we want to accept
-		 * INT128_MIN, which will overflow the positive accumulator.  We can
-		 * detect this case easily though because INT128_MIN is the only
-		 * nonzero value for which -val == val (on a two's complement machine,
-		 * anyway).
-		 */
-		if ((val / NBASE) != oldval)	/* possible overflow? */
-		{
-			if (!neg || (-val) != val || val == 0 || oldval < 0)
-			{
-				free_var(&rounded);
-				return false;
-			}
-		}
-	}
-
-	free_var(&rounded);
-
-	*result = neg ? -val : val;
-	return true;
-}
-
 /*
  * Convert 128 bit integer to numeric.
  */
 static void
-int128_to_numericvar(int128 val, NumericVar *var)
+int128_to_numericvar(INT128 val, NumericVar *var)
 {
-	uint128		uval,
-				newuval;
+	int			sign;
 	NumericDigit *ptr;
 	int			ndigits;
+	int32		dig;
 
 	/* int128 can require at most 39 decimal digits; add one for safety */
 	alloc_var(var, 40 / DEC_DIGITS);
-	if (val < 0)
-	{
-		var->sign = NUMERIC_NEG;
-		uval = -val;
-	}
-	else
-	{
-		var->sign = NUMERIC_POS;
-		uval = val;
-	}
+	sign = int128_sign(val);
+	var->sign = sign < 0 ? NUMERIC_NEG : NUMERIC_POS;
 	var->dscale = 0;
-	if (val == 0)
+	if (sign == 0)
 	{
 		var->ndigits = 0;
 		var->weight = 0;
@@ -8440,15 +8188,13 @@ int128_to_numericvar(int128 val, NumericVar *var)
 	{
 		ptr--;
 		ndigits++;
-		newuval = uval / NBASE;
-		*ptr = uval - newuval * NBASE;
-		uval = newuval;
-	} while (uval);
+		int128_div_mod_int32(&val, NBASE, &dig);
+		*ptr = dig;
+	} while (!int128_is_zero(val));
 	var->digits = ptr;
 	var->ndigits = ndigits;
 	var->weight = ndigits - 1;
 }
-#endif
 
 /*
  * Convert a NumericVar to float8; if out of range, return +/- HUGE_VAL
diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index d45296e1ad1..1360e1c4ed1 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -37,11 +37,18 @@
  * that a native int128 type would (probably) have.  This makes no difference
  * for ordinary use of INT128, but allows union'ing INT128 with int128 for
  * testing purposes.
+ *
+ * PG_INT128_HI_INT64 and PG_INT128_LO_UINT64 allow the (signed) high and
+ * (unsigned) low 64-bit integer parts to be extracted portably on all
+ * platforms.
  */
 #if USE_NATIVE_INT128
 
 typedef int128 INT128;
 
+#define PG_INT128_HI_INT64(i128)	((int64) ((i128) >> 64))
+#define PG_INT128_LO_UINT64(i128)	((uint64) (i128))
+
 #else
 
 typedef struct
@@ -55,7 +62,28 @@ typedef struct
 #endif
 } INT128;
 
+#define PG_INT128_HI_INT64(i128)	((i128).hi)
+#define PG_INT128_LO_UINT64(i128)	((i128).lo)
+
+#endif
+
+/*
+ * Construct an INT128 from (signed) high and (unsigned) low 64-bit integer
+ * parts.
+ */
+static inline INT128
+make_int128(int64 hi, uint64 lo)
+{
+#if USE_NATIVE_INT128
+	return (((int128) hi) << 64) + lo;
+#else
+	INT128		val;
+
+	val.hi = hi;
+	val.lo = lo;
+	return val;
 #endif
+}
 
 /*
  * Add an unsigned int64 value into an INT128 variable.
@@ -108,6 +136,58 @@ int128_add_int64(INT128 *i128, int64 v)
 #endif
 }
 
+/*
+ * Add an INT128 value into an INT128 variable.
+ */
+static inline void
+int128_add_int128(INT128 *i128, INT128 v)
+{
+#if USE_NATIVE_INT128
+	*i128 += v;
+#else
+	int128_add_uint64(i128, v.lo);
+	i128->hi += v.hi;
+#endif
+}
+
+/*
+ * Subtract an unsigned int64 value from an INT128 variable.
+ */
+static inline void
+int128_sub_uint64(INT128 *i128, uint64 v)
+{
+#if USE_NATIVE_INT128
+	*i128 -= v;
+#else
+	/*
+	 * This is like int128_add_uint64(), except we must propagate a borrow to
+	 * (subtract 1 from) the .hi part if the new .lo part is greater than the
+	 * old .lo part.
+	 */
+	uint64		oldlo = i128->lo;
+
+	i128->lo -= v;
+	i128->hi -= (i128->lo > oldlo);
+#endif
+}
+
+/*
+ * Subtract a signed int64 value from an INT128 variable.
+ */
+static inline void
+int128_sub_int64(INT128 *i128, int64 v)
+{
+#if USE_NATIVE_INT128
+	*i128 -= v;
+#else
+	/* Like int128_add_int64() with the sign of v inverted */
+	uint64		oldlo = i128->lo;
+
+	i128->lo -= v;
+	i128->hi -= (i128->lo > oldlo) + (v >> 63);
+#endif
+}
+
 /*
  * INT64_HI_INT32 extracts the most significant 32 bits of int64 as int32.
  * INT64_LO_UINT32 extracts the least significant 32 bits as uint32.
@@ -178,6 +258,165 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 #endif
 }
 
+/*
+ * Subtract the 128-bit product of two int64 values from an INT128 variable.
+ */
+static inline void
+int128_sub_int64_mul_int64(INT128 *i128, int64 x, int64 y)
+{
+#if USE_NATIVE_INT128
+	*i128 -= (int128) x * (int128) y;
+#else
+	/* Same scheme as int128_add_int64_mul_int64(), subtracting each term */
+	if (x != 0 && y != 0)		/* otherwise the product is zero: nothing to do */
+	{
+		int32		x_hi = INT64_HI_INT32(x);
+		uint32		x_lo = INT64_LO_UINT32(x);
+		int32		y_hi = INT64_HI_INT32(y);
+		uint32		y_lo = INT64_LO_UINT32(y);
+		int64		tmp;
+
+		/* the first term */
+		i128->hi -= (int64) x_hi * (int64) y_hi;
+
+		/* the second term: sign-extended with the sign of x */
+		tmp = (int64) x_hi * (int64) y_lo;
+		i128->hi -= INT64_HI_INT32(tmp);
+		int128_sub_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32);
+
+		/* the third term: sign-extended with the sign of y */
+		tmp = (int64) x_lo * (int64) y_hi;
+		i128->hi -= INT64_HI_INT32(tmp);
+		int128_sub_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32);
+
+		/* the fourth term: always unsigned */
+		int128_sub_uint64(i128, (uint64) x_lo * (uint64) y_lo);
+	}
+#endif
+}
+
+/*
+ * Divide an INT128 variable by a signed int32 value, storing the quotient back
+ * in *i128 and the remainder in *remainder (same sign as the original *i128).
+ *
+ * Note: This provides no protection against dividing by 0, or dividing
+ * INT128_MIN by -1, which overflows.  It is the caller's responsibility to
+ * guard against those.
+ */
+static inline void
+int128_div_mod_int32(INT128 *i128, int32 v, int32 *remainder)
+{
+#if USE_NATIVE_INT128
+	int128		old_i128 = *i128;
+
+	*i128 /= v;
+	*remainder = (int32) (old_i128 - *i128 * v);
+#else
+	/*
+	 * To avoid any intermediate values overflowing (as happens if INT64_MIN
+	 * is divided by -1), we first compute the quotient abs(*i128) / abs(v)
+	 * using unsigned 64-bit arithmetic, and then fix the signs up at the end.
+	 *
+	 * The quotient is computed using the short division algorithm described
+	 * in Knuth volume 2, section 4.3.1 exercise 16 (cf. div_var_int() in
+	 * numeric.c).  Since the absolute value of the divisor is known to be at
+	 * most 2^31, the remainder carried from one digit to the next is at most
+	 * 2^31 - 1, and so there is no danger of overflow when this is combined
+	 * with the next digit (a 32-bit unsigned integer).
+	 */
+	uint64		n_hi;
+	uint64		n_lo;
+	uint32		d;
+	uint64		q;
+	uint64		r;
+	uint64		tmp;
+
+	/* numerator: absolute value of *i128 */
+	if (i128->hi < 0)
+	{
+		n_hi = -((uint64) i128->hi);
+		n_lo = -i128->lo;
+		if (n_lo != 0)
+			n_hi--;
+	}
+	else
+	{
+		n_hi = i128->hi;
+		n_lo = i128->lo;
+	}
+
+	/* denominator: |v|, negated as unsigned because abs(INT_MIN) is undefined */
+	d = (v < 0) ? -((uint32) v) : (uint32) v;
+
+	/* quotient and remainder of high 64 bits */
+	q = n_hi / d;
+	r = n_hi % d;
+	n_hi = q;
+
+	/* quotient and remainder of next 32 bits (upper half of n_lo) */
+	tmp = (r << 32) + (n_lo >> 32);
+	q = tmp / d;
+	r = tmp % d;
+
+	/* quotient and remainder of last 32 bits (lower half of n_lo) */
+	tmp = (r << 32) + (uint32) n_lo;
+	n_lo = q << 32;
+	q = tmp / d;
+	r = tmp % d;
+	n_lo += q;
+
+	/* final remainder should have the same sign as *i128 */
+	*remainder = i128->hi < 0 ? (int32) -r : (int32) r;
+
+	/* store the quotient in *i128, negating it if necessary */
+	if ((i128->hi < 0) != (v < 0))
+	{
+		n_hi = -n_hi;
+		n_lo = -n_lo;
+		if (n_lo != 0)
+			n_hi--;
+	}
+	i128->hi = (int64) n_hi;
+	i128->lo = n_lo;
+#endif
+}
+
+/*
+ * Test if an INT128 value is zero (returns true if it is, false otherwise).
+ */
+static inline bool
+int128_is_zero(INT128 x)
+{
+#if USE_NATIVE_INT128
+	return x == 0;
+#else
+	return x.hi == 0 && x.lo == 0;	/* zero only if both halves are zero */
+#endif
+}
+
+/*
+ * Return the sign of an INT128 value (returns -1, 0, or +1).
+ */
+static inline int
+int128_sign(INT128 x)
+{
+#if USE_NATIVE_INT128
+	if (x < 0)
+		return -1;
+	if (x > 0)
+		return 1;
+	return 0;
+#else
+	if (x.hi < 0)				/* the sign is carried by the (signed) .hi part */
+		return -1;
+	if (x.hi > 0)
+		return 1;
+	if (x.lo > 0)				/* .hi is 0 here, so the unsigned .lo part decides */
+		return 1;
+	return 0;
+#endif
+}
+
 /*
  * Compare two INT128 values, return -1, 0, or +1.
  */
diff --git a/src/test/modules/test_int128/test_int128.c b/src/test/modules/test_int128/test_int128.c
index 62b67823fbd..3de20269f22 100644
--- a/src/test/modules/test_int128/test_int128.c
+++ b/src/test/modules/test_int128/test_int128.c
@@ -87,8 +87,13 @@ main(int argc, char **argv)
 		int64		x = pg_prng_uint64(&pg_global_prng_state);
 		int64		y = pg_prng_uint64(&pg_global_prng_state);
 		int64		z = pg_prng_uint64(&pg_global_prng_state);
+		int64		w = pg_prng_uint64(&pg_global_prng_state);
+		int32		z32 = (int32) z != 0 ? (int32) z : 1;	/* divisor: never 0 */
 		test128		t1;
 		test128		t2;
+		test128		t3;
+		int32		r1;
+		int32		r2;
 
 		/* check unsigned addition */
 		t1.hl.hi = x;
@@ -120,25 +125,111 @@ main(int argc, char **argv)
 			return 1;
 		}
 
-		/* check multiplication */
-		t1.i128 = (int128) x * (int128) y;
+		/* check 128-bit signed addition */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t3.hl.hi = z;
+		t3.hl.lo = w;
+		t1.i128 += t3.i128;
+		int128_add_int128(&t2.I128, t3.I128);
 
-		t2.hl.hi = t2.hl.lo = 0;
-		int128_add_int64_mul_int64(&t2.I128, x, y);
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX + %016lX%016lX\n", x, y, z, w);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check unsigned subtraction */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 -= (int128) (uint64) z;
+		int128_sub_uint64(&t2.I128, (uint64) z);
 
 		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
 		{
-			printf("%lX * %lX\n", x, y);
+			printf("%016lX%016lX - unsigned %lX\n", x, y, z);
 			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
 			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
 			return 1;
 		}
 
+		/* check signed subtraction */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 -= (int128) z;
+		int128_sub_int64(&t2.I128, z);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX - signed %lX\n", x, y, z);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check 64x64-bit multiply-add */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 += (int128) z * (int128) w;
+		int128_add_int64_mul_int64(&t2.I128, z, w);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX + %lX * %lX\n", x, y, z, w);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check 64x64-bit multiply-subtract */
+		t1.hl.hi = x;
+		t1.hl.lo = y;
+		t2 = t1;
+		t1.i128 -= (int128) z * (int128) w;
+		int128_sub_int64_mul_int64(&t2.I128, z, w);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX - %lX * %lX\n", x, y, z, w);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+
+		/* check 128/32-bit division: quotient first, then remainder */
+		t3.hl.hi = x;
+		t3.hl.lo = y;
+		t1.i128 = t3.i128 / z32;
+		r1 = (int32) (t3.i128 % z32);
+		t2 = t3;
+		int128_div_mod_int32(&t2.I128, z32, &r2);
+
+		if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo)
+		{
+			printf("%016lX%016lX / signed %X\n", t3.hl.hi, t3.hl.lo, (uint32) z32);
+			printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo);
+			printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo);
+			return 1;
+		}
+		if (r1 != r2)
+		{
+			printf("%016lX%016lX %% signed %X\n", t3.hl.hi, t3.hl.lo, (uint32) z32);
+			printf("native = %X\n", (uint32) r1);
+			printf("result = %X\n", (uint32) r2);
+			return 1;
+		}
+
 		/* check comparison */
 		t1.hl.hi = x;
 		t1.hl.lo = y;
 		t2.hl.hi = z;
-		t2.hl.lo = pg_prng_uint64(&pg_global_prng_state);
+		t2.hl.lo = w;
 
 		if (my_int128_compare(t1.i128, t2.i128) !=
 			int128_compare(t1.I128, t2.I128))
-- 
2.43.0

From c0bc41311e582bbaead97328b1c3dc59e217aefc Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Sat, 21 Jun 2025 12:22:30 +0100
Subject: [PATCH v2 2/5] Refactor int128.h, bringing the native and non-native
 code together.

This rearranges the code in include/common/int128.h, so that the
native and non-native implementations of each function are together
inside the function body (as they are in include/common/int.h), rather
than being in separate parts of the file.

This improves readability and maintainability, making it easier to
compare the native and non-native implementations, and avoiding the
need to duplicate every function comment and declaration.
---
 src/include/common/int128.h | 112 ++++++++++++++----------------------
 1 file changed, 42 insertions(+), 70 deletions(-)

diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index f22530a164e..8c300e56d9a 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -29,81 +29,21 @@
 #endif
 #endif
 
-
-#if USE_NATIVE_INT128
-
-typedef int128 INT128;
-
-/*
- * Add an unsigned int64 value into an INT128 variable.
- */
-static inline void
-int128_add_uint64(INT128 *i128, uint64 v)
-{
-	*i128 += v;
-}
-
 /*
- * Add a signed int64 value into an INT128 variable.
- */
-static inline void
-int128_add_int64(INT128 *i128, int64 v)
-{
-	*i128 += v;
-}
-
-/*
- * Add the 128-bit product of two int64 values into an INT128 variable.
+ * If native int128 support is enabled, INT128 is just int128. Otherwise, it
+ * is a structure with separate 64-bit high and low parts.
  *
- * XXX with a stupid compiler, this could actually be less efficient than
- * the other implementation; maybe we should do it by hand always?
- */
-static inline void
-int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
-{
-	*i128 += (int128) x * (int128) y;
-}
-
-/*
- * Compare two INT128 values, return -1, 0, or +1.
- */
-static inline int
-int128_compare(INT128 x, INT128 y)
-{
-	if (x < y)
-		return -1;
-	if (x > y)
-		return 1;
-	return 0;
-}
-
-/*
- * Widen int64 to INT128.
- */
-static inline INT128
-int64_to_int128(int64 v)
-{
-	return (INT128) v;
-}
-
-/*
- * Convert INT128 to int64 (losing any high-order bits).
- * This also works fine for casting down to uint64.
- */
-static inline int64
-int128_to_int64(INT128 val)
-{
-	return (int64) val;
-}
-
-#else							/* !USE_NATIVE_INT128 */
-
-/*
  * We lay out the INT128 structure with the same content and byte ordering
  * that a native int128 type would (probably) have.  This makes no difference
  * for ordinary use of INT128, but allows union'ing INT128 with int128 for
  * testing purposes.
  */
+#if USE_NATIVE_INT128
+
+typedef int128 INT128;
+
+#else
+
 typedef struct
 {
 #ifdef WORDS_BIGENDIAN
@@ -115,12 +55,17 @@ typedef struct
 #endif
 } INT128;
 
+#endif
+
 /*
  * Add an unsigned int64 value into an INT128 variable.
  */
 static inline void
 int128_add_uint64(INT128 *i128, uint64 v)
 {
+#if USE_NATIVE_INT128
+	*i128 += v;
+#else
 	/*
 	 * First add the value to the .lo part, then check to see if a carry needs
 	 * to be propagated into the .hi part.  A carry is needed if both inputs
@@ -134,6 +79,7 @@ int128_add_uint64(INT128 *i128, uint64 v)
 	if (((int64) v < 0 && (int64) oldlo < 0) ||
 		(((int64) v < 0 || (int64) oldlo < 0) && (int64) i128->lo >= 0))
 		i128->hi++;
+#endif
 }
 
 /*
@@ -142,6 +88,9 @@ int128_add_uint64(INT128 *i128, uint64 v)
 static inline void
 int128_add_int64(INT128 *i128, int64 v)
 {
+#if USE_NATIVE_INT128
+	*i128 += v;
+#else
 	/*
 	 * This is much like the above except that the carry logic differs for
 	 * negative v.  Ordinarily we'd need to subtract 1 from the .hi part
@@ -161,6 +110,7 @@ int128_add_int64(INT128 *i128, int64 v)
 		if (!((int64) oldlo < 0 || (int64) i128->lo >= 0))
 			i128->hi--;
 	}
+#endif
 }
 
 /*
@@ -176,6 +126,13 @@ int128_add_int64(INT128 *i128, int64 v)
 static inline void
 int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 {
+#if USE_NATIVE_INT128
+	/*
+	 * XXX with a stupid compiler, this could actually be less efficient than
+	 * the non-native implementation; maybe we should do it by hand always?
+	 */
+	*i128 += (int128) x * (int128) y;
+#else
 	/* INT64_AU32 must use arithmetic right shift */
 	StaticAssertDecl(((int64) -1 >> 1) == (int64) -1,
 					 "arithmetic right shift is needed");
@@ -229,6 +186,7 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 		/* the fourth term: always unsigned */
 		int128_add_uint64(i128, x_l32 * y_l32);
 	}
+#endif
 }
 
 /*
@@ -237,6 +195,13 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 static inline int
 int128_compare(INT128 x, INT128 y)
 {
+#if USE_NATIVE_INT128
+	if (x < y)
+		return -1;
+	if (x > y)
+		return 1;
+	return 0;
+#else
 	if (x.hi < y.hi)
 		return -1;
 	if (x.hi > y.hi)
@@ -246,6 +211,7 @@ int128_compare(INT128 x, INT128 y)
 	if (x.lo > y.lo)
 		return 1;
 	return 0;
+#endif
 }
 
 /*
@@ -254,11 +220,15 @@ int128_compare(INT128 x, INT128 y)
 static inline INT128
 int64_to_int128(int64 v)
 {
+#if USE_NATIVE_INT128
+	return (INT128) v;
+#else
 	INT128		val;
 
 	val.lo = (uint64) v;
 	val.hi = (v < 0) ? -INT64CONST(1) : INT64CONST(0);
 	return val;
+#endif
 }
 
 /*
@@ -268,9 +238,11 @@ int64_to_int128(int64 v)
 static inline int64
 int128_to_int64(INT128 val)
 {
+#if USE_NATIVE_INT128
+	return (int64) val;
+#else
 	return (int64) val.lo;
+#endif
 }
 
-#endif							/* USE_NATIVE_INT128 */
-
 #endif							/* INT128_H */
-- 
2.43.0

From 0578f2a824270859a572287d9aee7c3894cb73fb Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Thu, 10 Jul 2025 13:51:53 +0100
Subject: [PATCH v2 1/5] Convert src/tools/testint128.c into a test module.

This creates a new test module "test_int128" and moves
src/tools/testint128.c to src/test/modules/test_int128/test_int128.c,
so that it can be built using the normal build system, and 128-bit
integer arithmetic gets tested automatically.

While at it, fix the test128 union in the test code: the "hl" member
of test128 was incorrectly defined to be a union instead of a struct,
which meant that the tests were only ever setting and checking half of
each 128-bit integer value.
---
 src/include/common/int128.h                   |  2 +-
 src/test/modules/Makefile                     |  1 +
 src/test/modules/meson.build                  |  1 +
 src/test/modules/test_int128/.gitignore       |  2 ++
 src/test/modules/test_int128/Makefile         | 23 +++++++++++++
 src/test/modules/test_int128/meson.build      | 33 +++++++++++++++++++
 .../modules/test_int128/t/001_test_int128.pl  | 22 +++++++++++++
 .../modules/test_int128/test_int128.c}        |  6 ++--
 8 files changed, 86 insertions(+), 4 deletions(-)
 create mode 100644 src/test/modules/test_int128/.gitignore
 create mode 100644 src/test/modules/test_int128/Makefile
 create mode 100644 src/test/modules/test_int128/meson.build
 create mode 100644 src/test/modules/test_int128/t/001_test_int128.pl
 rename src/{tools/testint128.c => test/modules/test_int128/test_int128.c} (98%)

diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index a50f5709c29..f22530a164e 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -6,7 +6,7 @@
  * We make use of the native int128 type if there is one, otherwise
  * implement things the hard way based on two int64 halves.
  *
- * See src/tools/testint128.c for a simple test harness for this file.
+ * See src/test/modules/test_int128 for a simple test harness for this file.
  *
  * Copyright (c) 2017-2025, PostgreSQL Global Development Group
  *
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index aa1d27bbed3..a31fad53497 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -24,6 +24,7 @@ SUBDIRS = \
 		  test_escape \
 		  test_extensions \
 		  test_ginpostinglist \
+		  test_int128 \
 		  test_integerset \
 		  test_json_parser \
 		  test_lfind \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 9de0057bd1d..df4f13fcbb0 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -23,6 +23,7 @@ subdir('test_dsm_registry')
 subdir('test_escape')
 subdir('test_extensions')
 subdir('test_ginpostinglist')
+subdir('test_int128')
 subdir('test_integerset')
 subdir('test_json_parser')
 subdir('test_lfind')
diff --git a/src/test/modules/test_int128/.gitignore b/src/test/modules/test_int128/.gitignore
new file mode 100644
index 00000000000..277fec6ed2c
--- /dev/null
+++ b/src/test/modules/test_int128/.gitignore
@@ -0,0 +1,2 @@
+/tmp_check/
+/test_int128
diff --git a/src/test/modules/test_int128/Makefile b/src/test/modules/test_int128/Makefile
new file mode 100644
index 00000000000..2e86ee93a9d
--- /dev/null
+++ b/src/test/modules/test_int128/Makefile
@@ -0,0 +1,23 @@
+# src/test/modules/test_int128/Makefile
+
+PGFILEDESC = "test_int128 - test 128-bit integer arithmetic"
+
+PROGRAM = test_int128
+OBJS = $(WIN32RES) test_int128.o
+
+PG_CPPFLAGS = -I$(libpq_srcdir)
+PG_LIBS_INTERNAL += $(libpq_pgport)
+
+NO_INSTALL = 1
+TAP_TESTS = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_int128
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_int128/meson.build b/src/test/modules/test_int128/meson.build
new file mode 100644
index 00000000000..4c2be7a0326
--- /dev/null
+++ b/src/test/modules/test_int128/meson.build
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+test_int128_sources = files(
+  'test_int128.c',
+)
+
+if host_system == 'windows'
+  test_int128_sources += rc_bin_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_int128',
+    '--FILEDESC', 'test int128 program',])
+endif
+
+test_int128 = executable('test_int128',
+  test_int128_sources,
+  dependencies: [frontend_code, libpq],
+  kwargs: default_bin_args + {
+    'install': false,
+  },
+)
+testprep_targets += test_int128
+
+
+tests += {
+  'name': 'test_int128',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'tap': {
+    'tests': [
+      't/001_test_int128.pl',
+    ],
+    'deps': [test_int128],
+  },
+}
diff --git a/src/test/modules/test_int128/t/001_test_int128.pl b/src/test/modules/test_int128/t/001_test_int128.pl
new file mode 100644
index 00000000000..03f4abfb1bb
--- /dev/null
+++ b/src/test/modules/test_int128/t/001_test_int128.pl
@@ -0,0 +1,22 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test 128-bit integer arithmetic code in int128.h
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Run the test program with 10M iterations
+my $exe = "test_int128";
+my $size = 10_000_000;
+
+note "testing executable $exe";
+
+my ($stdout, $stderr) = run_command([ $exe, $size ]);
+
+is($stdout, "", "test_int128");
+is($stderr, "", "test_int128");
+
+done_testing();
diff --git a/src/tools/testint128.c b/src/test/modules/test_int128/test_int128.c
similarity index 98%
rename from src/tools/testint128.c
rename to src/test/modules/test_int128/test_int128.c
index a25631e277d..62b67823fbd 100644
--- a/src/tools/testint128.c
+++ b/src/test/modules/test_int128/test_int128.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * testint128.c
+ * test_int128.c
  *	  Testbed for roll-our-own 128-bit integer arithmetic.
  *
  * This is a standalone test program that compares the behavior of an
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  src/tools/testint128.c
+ *	  src/test/modules/test_int128/test_int128.c
  *
  *-------------------------------------------------------------------------
  */
@@ -36,7 +36,7 @@ typedef union
 {
 	int128		i128;
 	INT128		I128;
-	union
+	struct
 	{
 #ifdef WORDS_BIGENDIAN
 		int64		hi;
-- 
2.43.0

From a0d835265851d1188e9a723500035372b175366a Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Sat, 21 Jun 2025 13:00:50 +0100
Subject: [PATCH v2 3/5] Optimise non-native 128-bit addition in int128.h.

On platforms without native 128-bit integer support, the unsigned
addition code in int128.h can be made significantly simpler and faster
by noting that the low-part addition is unsigned integer arithmetic,
which is just modular arithmetic, and so the test for carry can be
written as a single "new < old" test. This can then be made branchless
to produce the same machine instructions as native 128-bit addition.

The signed addition case can be coded in almost the same way, with
just a single extra term to compensate for the sign of the input.
Again, this is intended to be branchless, and to match the native
128-bit integer addition code.
---
 src/include/common/int128.h | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index 8c300e56d9a..0f7e90ee887 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -68,17 +68,17 @@ int128_add_uint64(INT128 *i128, uint64 v)
 #else
 	/*
 	 * First add the value to the .lo part, then check to see if a carry needs
-	 * to be propagated into the .hi part.  A carry is needed if both inputs
-	 * have high bits set, or if just one input has high bit set while the new
-	 * .lo part doesn't.  Remember that .lo part is unsigned; we cast to
-	 * signed here just as a cheap way to check the high bit.
+	 * to be propagated into the .hi part.  Since this is unsigned integer
+	 * arithmetic, which is just modular arithmetic, a carry is needed if the
+	 * new .lo part is less than the old .lo part (i.e., if modular
+	 * wrap-around occurred).  Writing this in the form below, rather than
+	 * using an "if" statement causes modern compilers to produce branchless
+	 * machine code identical to the native code.
 	 */
 	uint64		oldlo = i128->lo;
 
 	i128->lo += v;
-	if (((int64) v < 0 && (int64) oldlo < 0) ||
-		(((int64) v < 0 || (int64) oldlo < 0) && (int64) i128->lo >= 0))
-		i128->hi++;
+	i128->hi += (i128->lo < oldlo);
 #endif
 }
 
@@ -93,23 +93,18 @@ int128_add_int64(INT128 *i128, int64 v)
 #else
 	/*
 	 * This is much like the above except that the carry logic differs for
-	 * negative v.  Ordinarily we'd need to subtract 1 from the .hi part
-	 * (corresponding to adding the sign-extended bits of v to it); but if
-	 * there is a carry out of the .lo part, that cancels and we do nothing.
+	 * negative v -- we need to subtract 1 from the .hi part if the new .lo
+	 * value is greater than the old .lo value.  That can be achieved without
+	 * any branching by adding the sign bit from v (v >> 63 = 0 or -1) to the
+	 * previous result (for negative v, if the new .lo value is less than the
+	 * old .lo value, the two terms cancel and we leave the .hi part
+	 * unchanged, otherwise we subtract 1 from the .hi part).  Again, this
+	 * produces identical output to the native code with modern compilers.
 	 */
 	uint64		oldlo = i128->lo;
 
 	i128->lo += v;
-	if (v >= 0)
-	{
-		if ((int64) oldlo < 0 && (int64) i128->lo >= 0)
-			i128->hi++;
-	}
-	else
-	{
-		if (!((int64) oldlo < 0 || (int64) i128->lo >= 0))
-			i128->hi--;
-	}
+	i128->hi += (i128->lo < oldlo) + (v >> 63);
 #endif
 }
 
-- 
2.43.0

From 83d9663323ec70138844ba0021a011e5791b1f3b Mon Sep 17 00:00:00 2001
From: Dean Rasheed <dean.a.rash...@gmail.com>
Date: Sat, 21 Jun 2025 18:09:15 +0100
Subject: [PATCH v2 4/5] Simplify non-native 64x64-bit multiplication in
 int128.h.

In int128_add_int64_mul_int64(), in the non-native code, use signed
64-bit integer multiplication instead of unsigned multiplication for
the first three product terms. This simplifies the code needed to add
each product term to the result, leading to more compact and efficient
code. The actual performance gain is quite modest, but this seems
worth it to improve the code's readability.
---
 src/include/common/int128.h | 48 ++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/src/include/common/int128.h b/src/include/common/int128.h
index 0f7e90ee887..d45296e1ad1 100644
--- a/src/include/common/int128.h
+++ b/src/include/common/int128.h
@@ -109,11 +109,11 @@ int128_add_int64(INT128 *i128, int64 v)
 }
 
 /*
- * INT64_AU32 extracts the most significant 32 bits of int64 as int64, while
- * INT64_AL32 extracts the least significant 32 bits as uint64.
+ * INT64_HI_INT32 extracts the most significant 32 bits of int64 as int32.
+ * INT64_LO_UINT32 extracts the least significant 32 bits as uint32.
  */
-#define INT64_AU32(i64) ((i64) >> 32)
-#define INT64_AL32(i64) ((i64) & UINT64CONST(0xFFFFFFFF))
+#define INT64_HI_INT32(i64)		((int32) ((i64) >> 32))
+#define INT64_LO_UINT32(i64)	((uint32) (i64))
 
 /*
  * Add the 128-bit product of two int64 values into an INT128 variable.
@@ -128,7 +128,7 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 	 */
 	*i128 += (int128) x * (int128) y;
 #else
-	/* INT64_AU32 must use arithmetic right shift */
+	/* INT64_HI_INT32 must use arithmetic right shift */
 	StaticAssertDecl(((int64) -1 >> 1) == (int64) -1,
 					 "arithmetic right shift is needed");
 
@@ -153,33 +153,27 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y)
 	/* No need to work hard if product must be zero */
 	if (x != 0 && y != 0)
 	{
-		int64		x_u32 = INT64_AU32(x);
-		uint64		x_l32 = INT64_AL32(x);
-		int64		y_u32 = INT64_AU32(y);
-		uint64		y_l32 = INT64_AL32(y);
+		int32		x_hi = INT64_HI_INT32(x);
+		uint32		x_lo = INT64_LO_UINT32(x);
+		int32		y_hi = INT64_HI_INT32(y);
+		uint32		y_lo = INT64_LO_UINT32(y);
 		int64		tmp;
 
 		/* the first term */
-		i128->hi += x_u32 * y_u32;
-
-		/* the second term: sign-extend it only if x is negative */
-		tmp = x_u32 * y_l32;
-		if (x < 0)
-			i128->hi += INT64_AU32(tmp);
-		else
-			i128->hi += ((uint64) tmp) >> 32;
-		int128_add_uint64(i128, ((uint64) INT64_AL32(tmp)) << 32);
-
-		/* the third term: sign-extend it only if y is negative */
-		tmp = x_l32 * y_u32;
-		if (y < 0)
-			i128->hi += INT64_AU32(tmp);
-		else
-			i128->hi += ((uint64) tmp) >> 32;
-		int128_add_uint64(i128, ((uint64) INT64_AL32(tmp)) << 32);
+		i128->hi += (int64) x_hi * (int64) y_hi;
+
+		/* the second term: sign-extended with the sign of x */
+		tmp = (int64) x_hi * (int64) y_lo;
+		i128->hi += INT64_HI_INT32(tmp);
+		int128_add_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32);
+
+		/* the third term: sign-extended with the sign of y */
+		tmp = (int64) x_lo * (int64) y_hi;
+		i128->hi += INT64_HI_INT32(tmp);
+		int128_add_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32);
 
 		/* the fourth term: always unsigned */
-		int128_add_uint64(i128, x_l32 * y_l32);
+		int128_add_uint64(i128, (uint64) x_lo * (uint64) y_lo);
 	}
 #endif
 }
-- 
2.43.0

Reply via email to