From 11a33fba2d02f3f76f1202735f87f4e27416253b Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Sun, 31 Aug 2025 15:23:18 +0300
Subject: [PATCH v1] Add xicorr(X, Y)

---
 doc/src/sgml/func/func-aggregate.sgml    |  28 ++++
 src/backend/utils/adt/float.c            | 156 +++++++++++++++++++++++
 src/include/catalog/pg_aggregate.dat     |   2 +
 src/include/catalog/pg_proc.dat          |   9 ++
 src/test/regress/expected/aggregates.out |  10 ++
 src/test/regress/sql/aggregates.sql      |   5 +
 6 files changed, 210 insertions(+)

diff --git a/doc/src/sgml/func/func-aggregate.sgml b/doc/src/sgml/func/func-aggregate.sgml
index f50b692516b..4de10a5cb30 100644
--- a/doc/src/sgml/func/func-aggregate.sgml
+++ b/doc/src/sgml/func/func-aggregate.sgml
@@ -1091,6 +1091,34 @@ SELECT count(*) FROM sometable;
        </para></entry>
        <entry>Yes</entry>
       </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>xi correlation</primary>
+        </indexterm>
+        <indexterm>
+         <primary>xicorr</primary>
+        </indexterm>
+        <function>xicorr</function> ( <parameter>Y</parameter> <type>double precision</type>, <parameter>X</parameter> <type>double precision</type> )
+        <returnvalue>double precision</returnvalue>
+       </para>
+       <para>
+        Computes Chaterjee's Χi (ξ) correlation coefficient.
+        The xi correlation coefficient is a measure of association between two variables;
+        the value tends to be close to zero when the variables
+        are independent and close to 1 when there is a strong association.
+        Unlike other correlation coefficients,
+        the xi correlation is effective even when the association is not monotonic.
+        Note that this statistic is non-symmetric by design.
+        Tied values of <replaceable>x</replaceable> are ordered
+        deterministically by their <replaceable>y</replaceable>
+        values. This differs from some software which breaks ties arbitrarily.
+        With small samples, the sample coefficient can be slightly negative.
+        This implementation clamps such values to 0.
+       </para></entry>
+       <entry>Yes</entry>
+      </row>
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index 7b97d2be6ca..990430c4c53 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -3754,6 +3754,162 @@ float8_corr(PG_FUNCTION_ARGS)
 	PG_RETURN_FLOAT8(Sxy / sqrt(Sxx * Syy));
 }
 
+/* xicorr(X, Y) infrastructure */
+
+typedef struct XiState
+{
+	int     n;       /* number of rows */
+	int     cap;     /* allocated capacity */
+	double *xs;
+	double *ys;
+} XiState;
+
+Datum
+xicorr_transfn(PG_FUNCTION_ARGS)
+{
+	MemoryContext aggctx;
+	XiState *st;
+	int newcap;
+	double x = PG_GETARG_FLOAT8(1);
+	double y = PG_GETARG_FLOAT8(2);
+
+	if (!AggCheckCallContext(fcinfo, &aggctx))
+		elog(ERROR, "must be called as aggregate");
+
+	if (PG_ARGISNULL(0))
+	{
+		/* first call */
+		st = (XiState *) MemoryContextAllocZero(aggctx, sizeof(XiState));
+		st->cap = 64;
+		st->xs = (double *) MemoryContextAlloc(aggctx, sizeof(double)*st->cap);
+		st->ys = (double *) MemoryContextAlloc(aggctx, sizeof(double)*st->cap);
+		st->n  = 0;
+	}
+	else
+	{
+		st = (XiState *) PG_GETARG_POINTER(0);
+	}
+
+	if (PG_ARGISNULL(1) || PG_ARGISNULL(2))
+		PG_RETURN_POINTER(st);
+
+	/* grow if needed */
+	if (st->n >= st->cap)
+	{
+		newcap = st->cap * 2;
+		st->xs = (double *) repalloc(st->xs, sizeof(double)*newcap);
+		st->ys = (double *) repalloc(st->ys, sizeof(double)*newcap);
+		st->cap = newcap;
+	}
+
+	st->xs[st->n] = x;
+	st->ys[st->n] = y;
+	st->n++;
+
+	PG_RETURN_POINTER(st);
+}
+
+/* Compare indices by Y */
+static int
+cmp_y(const void *a, const void *b, void *arg)
+{
+	const XiState *st = (const XiState *) arg;
+	int ia = *(const int *)a;
+	int ib = *(const int *)b;
+
+	if (st->ys[ia] < st->ys[ib]) return -1;
+	if (st->ys[ia] > st->ys[ib]) return 1;
+	return 0;
+}
+
+/* Compare indices by X (break ties with Y) */
+static int
+cmp_x_then_y(const void *a, const void *b, void *arg)
+{
+	const XiState *st = (const XiState *) arg;
+	int ia = *(const int *)a;
+	int ib = *(const int *)b;
+
+	if (st->xs[ia] < st->xs[ib]) return -1;
+	if (st->xs[ia] > st->xs[ib]) return 1;
+
+	if (st->ys[ia] < st->ys[ib]) return -1;
+	if (st->ys[ia] > st->ys[ib]) return 1;
+
+	return 0;
+}
+
+Datum
+float8_xicorr(PG_FUNCTION_ARGS)
+{
+	XiState *st;
+	int n, j;
+	int *ix, *iy;
+	double *rankY;
+	double avg, S, denom, xi;
+
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	st = (XiState *) PG_GETARG_POINTER(0);
+	n = st->n;
+
+	if (n < 2)
+		PG_RETURN_NULL();
+
+	/* --- 1. rank Y --- */
+	iy = palloc(sizeof(int) * n);
+	for (int i = 0; i < n; i++) iy[i] = i;
+
+	qsort_arg(iy, n, sizeof(int), cmp_y, st);
+
+	rankY = palloc(sizeof(double) * n);
+
+	for (int i = 0; i < n; )
+	{
+		j = i + 1;
+		while (j < n && st->ys[iy[j]] == st->ys[iy[i]])
+			j++;
+
+		/* average rank for ties */
+		avg = ((double)(i + 1) + (double)j) / 2.0;
+		for (int k = i; k < j; k++)
+			rankY[iy[k]] = avg;
+
+		i = j;
+	}
+
+	/* --- 2. order indices by X (break ties by Y) --- */
+	ix = palloc(sizeof(int) * n);
+	for (int i = 0; i < n; i++) ix[i] = i;
+
+	qsort_arg(ix, n, sizeof(int), cmp_x_then_y, st);
+
+	/* --- 3. sum successive rank differences --- */
+	S = 0.0;
+	for (int t = 0; t < n - 1; t++)
+	{
+		int i0 = ix[t];
+		int i1 = ix[t + 1];
+		S += fabsl(rankY[i1] - rankY[i0]);
+	}
+
+	/* --- 4. coefficient --- */
+	denom = (double)n * (double)n - 1.0L;
+	if (denom <= 0.0L)
+		PG_RETURN_NULL();
+
+	xi = 1.0 - (3.0 * S) / denom;
+
+	/* Clamp to [0,1] if desired */
+	if (xi < 0.0)
+		xi = 0.0;
+	else if (xi > 1.0)
+		xi = 1.0;
+
+	PG_RETURN_FLOAT8(xi);
+}
+
 Datum
 float8_regr_r2(PG_FUNCTION_ARGS)
 {
diff --git a/src/include/catalog/pg_aggregate.dat b/src/include/catalog/pg_aggregate.dat
index d6aa1f6ec47..4a69ec2486a 100644
--- a/src/include/catalog/pg_aggregate.dat
+++ b/src/include/catalog/pg_aggregate.dat
@@ -506,6 +506,8 @@
 { aggfnoid => 'corr', aggtransfn => 'float8_regr_accum',
   aggfinalfn => 'float8_corr', aggcombinefn => 'float8_regr_combine',
   aggtranstype => '_float8', agginitval => '{0,0,0,0,0,0}' },
+{ aggfnoid => 'xicorr', aggtransfn => 'xicorr_transfn',
+  aggfinalfn => 'float8_xicorr', aggtranstype => 'internal'},
 
 # boolean-and and boolean-or
 { aggfnoid => 'bool_and', aggtransfn => 'booland_statefunc',
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 118d6da1ace..715f3ae04cb 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5165,6 +5165,12 @@
 { oid => '2817', descr => 'aggregate final function',
   proname => 'float8_corr', prorettype => 'float8', proargtypes => '_float8',
   prosrc => 'float8_corr' },
+{ oid => '2775', descr => 'xicorr transition function', prokind => 'f',
+  proname => 'xicorr_transfn', prorettype => 'internal', proargtypes => 'internal float8 float8',
+  prosrc => 'xicorr_transfn', proisstrict => 'f', provolatile  => 'i', proparallel  => 's' },
+{ oid => '3814', descr => 'aggregate final function', prokind => 'f',
+  proname => 'float8_xicorr', prorettype => 'float8', proargtypes => 'internal',
+  prosrc => 'float8_xicorr', proisstrict  => 't', provolatile  => 'i', proparallel  => 's' },
 
 { oid => '3535', descr => 'aggregate transition function',
   proname => 'string_agg_transfn', proisstrict => 'f', prorettype => 'internal',
@@ -7311,6 +7317,9 @@
 { oid => '2829', descr => 'correlation coefficient',
   proname => 'corr', prokind => 'a', proisstrict => 'f', prorettype => 'float8',
   proargtypes => 'float8 float8', prosrc => 'aggregate_dummy' },
+{ oid => '3063', descr => 'xi correlation coefficient',
+  proname => 'xicorr', prokind => 'a', proisstrict => 'f', prorettype => 'float8',
+  proargtypes => 'float8 float8', prosrc => 'aggregate_dummy' },
 
 { oid => '2160',
   proname => 'text_pattern_lt', proleakproof => 't', prorettype => 'bool',
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index c35288eecde..69969859ea9 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -496,6 +496,16 @@ SELECT corr(b, a) FROM aggtest;
  0.139634516517873
 (1 row)
 
+CREATE TEMPORARY TABLE bigger_aggtest AS
+SELECT g AS a,
+       g*g + 2 AS b
+FROM generate_series(1,100) g;
+SELECT xicorr(b, a) FROM bigger_aggtest;
+      xicorr      
+------------------
+ 0.97029702970297
+(1 row)
+
 -- check single-tuple behavior
 SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8);
  covar_pop | covar_samp 
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 62540b1ffa4..7714fdff281 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -134,6 +134,11 @@ SELECT regr_r2(b, a) FROM aggtest;
 SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
 SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
 SELECT corr(b, a) FROM aggtest;
+CREATE TEMPORARY TABLE bigger_aggtest AS
+SELECT g AS a,
+       g*g + 2 AS b
+FROM generate_series(1,100) g;
+SELECT xicorr(b, a) FROM bigger_aggtest;
 
 -- check single-tuple behavior
 SELECT covar_pop(1::float8,2::float8), covar_samp(3::float8,4::float8);
-- 
2.39.5 (Apple Git-154)

