[PATCH] Introduce array_shuffle() and array_sample()

Martin Kalcher Mon, 18 Jul 2022 12:03:56 -0700

Thanks for all your feedback and help. I got a patch that i considerready for review. It introduces two new functions:


  array_shuffle(anyarray) -> anyarray
  array_sample(anyarray, integer) -> anyarray

array_shuffle() shuffles an array (obviously). array_sample() picks nrandom elements from an array.


Is someone interested in looking at it? What are the next steps?

Martin

From 5498bb2d9f1fab4cad56cd0d3a6eeafa24a26c8e Mon Sep 17 00:00:00 2001
From: Martin Kalcher <martin.kalc...@aboutsource.net>
Date: Sun, 17 Jul 2022 18:06:04 +0200
Subject: [PATCH] Introduce array_shuffle() and array_sample()

* array_shuffle() shuffles to elements of an array.
* array_sample() chooses n elements from an array by random.

Shuffling of arrays can already be accomplished with SQL
using unnest() and array_agg(order by random()). But that is
very slow compared to the new functions.
---
 doc/src/sgml/func.sgml               |  34 ++++++
 src/backend/utils/adt/arrayfuncs.c   | 163 +++++++++++++++++++++++++++
 src/include/catalog/pg_proc.dat      |   6 +
 src/test/regress/expected/arrays.out |  30 +++++
 src/test/regress/sql/arrays.sql      |  11 ++
 5 files changed, 244 insertions(+)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b6783b7ad0..c676031b4a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -19395,6 +19395,40 @@ SELECT NULLIF(value, '(none)') ...
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_sample</primary>
+        </indexterm>
+        <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
+        <returnvalue>anyarray</returnvalue>
+       </para>
+       <para>
+        Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter>.
+       </para>
+       <para>
+        <literal>array_sample(ARRAY[1,2,3,4,5,6], 3)</literal>
+        <returnvalue>{4,5,1}</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_shuffle</primary>
+        </indexterm>
+        <function>array_shuffle</function> ( <type>anyarray</type> )
+        <returnvalue>anyarray</returnvalue>
+       </para>
+       <para>
+        Shuffles the elements of the array.
+       </para>
+       <para>
+        <literal>array_shuffle(ARRAY[1,2,3,4,5,6])</literal>
+        <returnvalue>{4,5,1,3,2,6}</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm id="function-array-to-string">
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index fb167f226a..a6769a8ebf 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -6872,3 +6872,166 @@ trim_array(PG_FUNCTION_ARGS)
 
 	PG_RETURN_DATUM(result);
 }
+
+/*
+ * Shuffle the elements of an array.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+	ArrayType  *array,
+			   *result;
+	int16		elmlen;
+	bool		elmbyval;
+	char		elmalign;
+	Oid			elmtyp;
+	TypeCacheEntry *typentry;
+	Datum	   *elms,
+				elm;
+	bool	   *nuls,
+				nul;
+	int			nelms,
+				i,
+				j;
+
+	array = PG_GETARG_ARRAYTYPE_P(0);
+	nelms = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+
+	/* There is no point in shuffling arrays with less then two elements. */
+	if (nelms < 2)
+		PG_RETURN_ARRAYTYPE_P(array);
+
+	elmtyp = ARR_ELEMTYPE(array);
+	typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+
+	if (typentry == NULL || typentry->type_id != elmtyp)
+	{
+		typentry = lookup_type_cache(elmtyp, 0);
+		fcinfo->flinfo->fn_extra = (void *) typentry;
+	}
+	elmlen = typentry->typlen;
+	elmbyval = typentry->typbyval;
+	elmalign = typentry->typalign;
+
+	deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+					  &elms, &nuls, &nelms);
+
+	/* Shuffle elements and nulls using Fisher-Yates shuffle algorithm. */
+	for (i = nelms - 1; i > 0; i--)
+	{
+		j = random() % (i + 1);
+		elm = elms[i];
+		nul = nuls[i];
+		elms[i] = elms[j];
+		nuls[i] = nuls[j];
+		elms[j] = elm;
+		nuls[j] = nul;
+	}
+
+	result = construct_md_array(elms, nuls,
+								ARR_NDIM(array), ARR_DIMS(array), ARR_LBOUND(array),
+								elmtyp, elmlen, elmbyval, elmalign);
+
+	pfree(elms);
+	pfree(nuls);
+	PG_FREE_IF_COPY(array, 0);
+
+	PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * Choose N random elements from an array.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+	ArrayType  *array,
+			   *result;
+	int16		elmlen;
+	bool		elmbyval;
+	char		elmalign;
+	Oid			elmtyp;
+	TypeCacheEntry *typentry;
+	Datum	   *elms,
+				elm;
+	bool	   *nuls,
+				nul;
+	int			nelms,
+				rnelms,
+				rdims[1],
+				rlbs[1],
+				i,
+				j;
+
+	array = PG_GETARG_ARRAYTYPE_P(0);
+	nelms = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+	elmtyp = ARR_ELEMTYPE(array);
+	rnelms = PG_GETARG_INT32(1);
+
+	if (rnelms < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("second parameter must not be negative")));
+
+	/* Return an empty array if the requested sample size is zero. */
+	if (rnelms == 0)
+	{
+		PG_FREE_IF_COPY(array, 0);
+		PG_RETURN_ARRAYTYPE_P(construct_empty_array(elmtyp));
+	}
+
+	/*
+	 * Return the original array if the requested sample size is greater than
+	 * or equal to its own size.
+	 */
+	if (rnelms >= nelms)
+		PG_RETURN_ARRAYTYPE_P(array);
+
+	typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+
+	if (typentry == NULL || typentry->type_id != elmtyp)
+	{
+		typentry = lookup_type_cache(elmtyp, 0);
+		fcinfo->flinfo->fn_extra = (void *) typentry;
+	}
+	elmlen = typentry->typlen;
+	elmbyval = typentry->typbyval;
+	elmalign = typentry->typalign;
+
+	deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+					  &elms, &nuls, &nelms);
+
+	/*
+	 * Move rnelms (requested sample size) randomly-chosen elements to the
+	 * beginning of the array, using the Fisher-Yates shuffle algorithm.
+	 */
+	for (i = 0; i < rnelms; i++)
+	{
+		/* random j such that i <= j < nelms */
+		j = (random() % (nelms - i)) + i;
+		elm = elms[i];
+		nul = nuls[i];
+		elms[i] = elms[j];
+		nuls[i] = nuls[j];
+		elms[j] = elm;
+		nuls[j] = nul;
+	}
+
+	/*
+	 * Construct a new array from the first rnelms (requested sample size)
+	 * elements of the shuffled array. Even though we are constructing a
+	 * one-dimensional array, we cannot use construct_array(), because it does
+	 * not support NULL elements.
+	 */
+	rdims[0] = rnelms;
+	rlbs[0] = 1;
+
+	result = construct_md_array(elms, nuls, 1, rdims, rlbs,
+								elmtyp, elmlen, elmbyval, elmalign);
+
+	pfree(elms);
+	pfree(nuls);
+	PG_FREE_IF_COPY(array, 0);
+
+	PG_RETURN_ARRAYTYPE_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 2e41f4d9e8..f51af5ee9f 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1681,6 +1681,12 @@
   proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8',
   proargtypes => 'internal oid internal int2 internal',
   prosrc => 'arraycontjoinsel' },
+{ oid => '8464', descr => 'shuffle array',
+  proname => 'array_shuffle', proisstrict => 't', prorettype => 'anyarray',
+  proargtypes => 'anyarray', prosrc => 'array_shuffle' },
+{ oid => '8465', descr => 'take samples from array',
+  proname => 'array_sample', proisstrict => 't', prorettype => 'anyarray',
+  proargtypes => 'anyarray int4', prosrc => 'array_sample' },
 
 { oid => '764', descr => 'large object import',
   proname => 'lo_import', provolatile => 'v', proparallel => 'u',
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index ce6f3a65f9..4994e6ac62 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -2445,3 +2445,33 @@ SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
 ERROR:  number of elements to trim must be between 0 and 3
 SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
 ERROR:  number of elements to trim must be between 0 and 3
+-- array_shuffle
+SELECT array_agg(a order by a)
+FROM
+(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a);
+    array_agg     
+------------------
+ {1,2,3,4,5,NULL}
+(1 row)
+
+-- array_sample
+SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3));
+ cardinality 
+-------------
+           3
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[];
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 10);
+ array_sample 
+--------------
+ {1,2,3,4,5}
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
+ERROR:  second parameter must not be negative
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index f774faf856..0234aafa06 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -754,3 +754,14 @@ FROM
 
 SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
 SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
+
+-- array_shuffle
+SELECT array_agg(a order by a)
+FROM
+(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a);
+
+-- array_sample
+SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3));
+SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[];
+SELECT array_sample('{1,2,3,4,5}'::int[], 10);
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
-- 
2.37.1

[PATCH] Introduce array_shuffle() and array_sample()

Reply via email to