Thanks for all your feedback and help. I got a patch that i consider
ready for review. It introduces two new functions:
array_shuffle(anyarray) -> anyarray
array_sample(anyarray, integer) -> anyarray
array_shuffle() shuffles an array (obviously). array_sample() picks n
random elements from an array.
Is someone interested in looking at it? What are the next steps?
Martin
From 5498bb2d9f1fab4cad56cd0d3a6eeafa24a26c8e Mon Sep 17 00:00:00 2001
From: Martin Kalcher <martin.kalc...@aboutsource.net>
Date: Sun, 17 Jul 2022 18:06:04 +0200
Subject: [PATCH] Introduce array_shuffle() and array_sample()
* array_shuffle() shuffles to elements of an array.
* array_sample() chooses n elements from an array by random.
Shuffling of arrays can already be accomplished with SQL
using unnest() and array_agg(order by random()). But that is
very slow compared to the new functions.
---
doc/src/sgml/func.sgml | 34 ++++++
src/backend/utils/adt/arrayfuncs.c | 163 +++++++++++++++++++++++++++
src/include/catalog/pg_proc.dat | 6 +
src/test/regress/expected/arrays.out | 30 +++++
src/test/regress/sql/arrays.sql | 11 ++
5 files changed, 244 insertions(+)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b6783b7ad0..c676031b4a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -19395,6 +19395,40 @@ SELECT NULLIF(value, '(none)') ...
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>array_sample</primary>
+ </indexterm>
+ <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
+ <returnvalue>anyarray</returnvalue>
+ </para>
+ <para>
+ Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter>.
+ </para>
+ <para>
+ <literal>array_sample(ARRAY[1,2,3,4,5,6], 3)</literal>
+ <returnvalue>{4,5,1}</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>array_shuffle</primary>
+ </indexterm>
+ <function>array_shuffle</function> ( <type>anyarray</type> )
+ <returnvalue>anyarray</returnvalue>
+ </para>
+ <para>
+ Shuffles the elements of the array.
+ </para>
+ <para>
+ <literal>array_shuffle(ARRAY[1,2,3,4,5,6])</literal>
+ <returnvalue>{4,5,1,3,2,6}</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm id="function-array-to-string">
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index fb167f226a..a6769a8ebf 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -6872,3 +6872,166 @@ trim_array(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(result);
}
+
+/*
+ * Shuffle the elements of an array.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+ ArrayType *array,
+ *result;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ Datum *elms,
+ elm;
+ bool *nuls,
+ nul;
+ int nelms,
+ i,
+ j;
+
+ array = PG_GETARG_ARRAYTYPE_P(0);
+ nelms = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+
+ /* There is no point in shuffling arrays with less then two elements. */
+ if (nelms < 2)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+ &elms, &nuls, &nelms);
+
+ /* Shuffle elements and nulls using Fisher-Yates shuffle algorithm. */
+ for (i = nelms - 1; i > 0; i--)
+ {
+ j = random() % (i + 1);
+ elm = elms[i];
+ nul = nuls[i];
+ elms[i] = elms[j];
+ nuls[i] = nuls[j];
+ elms[j] = elm;
+ nuls[j] = nul;
+ }
+
+ result = construct_md_array(elms, nuls,
+ ARR_NDIM(array), ARR_DIMS(array), ARR_LBOUND(array),
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ pfree(elms);
+ pfree(nuls);
+ PG_FREE_IF_COPY(array, 0);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * Choose N random elements from an array.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+ ArrayType *array,
+ *result;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ Datum *elms,
+ elm;
+ bool *nuls,
+ nul;
+ int nelms,
+ rnelms,
+ rdims[1],
+ rlbs[1],
+ i,
+ j;
+
+ array = PG_GETARG_ARRAYTYPE_P(0);
+ nelms = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+ elmtyp = ARR_ELEMTYPE(array);
+ rnelms = PG_GETARG_INT32(1);
+
+ if (rnelms < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("second parameter must not be negative")));
+
+ /* Return an empty array if the requested sample size is zero. */
+ if (rnelms == 0)
+ {
+ PG_FREE_IF_COPY(array, 0);
+ PG_RETURN_ARRAYTYPE_P(construct_empty_array(elmtyp));
+ }
+
+ /*
+ * Return the original array if the requested sample size is greater than
+ * or equal to its own size.
+ */
+ if (rnelms >= nelms)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+ &elms, &nuls, &nelms);
+
+ /*
+ * Move rnelms (requested sample size) randomly-chosen elements to the
+ * beginning of the array, using the Fisher-Yates shuffle algorithm.
+ */
+ for (i = 0; i < rnelms; i++)
+ {
+ /* random j such that i <= j < nelms */
+ j = (random() % (nelms - i)) + i;
+ elm = elms[i];
+ nul = nuls[i];
+ elms[i] = elms[j];
+ nuls[i] = nuls[j];
+ elms[j] = elm;
+ nuls[j] = nul;
+ }
+
+ /*
+ * Construct a new array from the first rnelms (requested sample size)
+ * elements of the shuffled array. Even though we are constructing a
+ * one-dimensional array, we cannot use construct_array(), because it does
+ * not support NULL elements.
+ */
+ rdims[0] = rnelms;
+ rlbs[0] = 1;
+
+ result = construct_md_array(elms, nuls, 1, rdims, rlbs,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ pfree(elms);
+ pfree(nuls);
+ PG_FREE_IF_COPY(array, 0);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 2e41f4d9e8..f51af5ee9f 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1681,6 +1681,12 @@
proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8',
proargtypes => 'internal oid internal int2 internal',
prosrc => 'arraycontjoinsel' },
+{ oid => '8464', descr => 'shuffle array',
+ proname => 'array_shuffle', proisstrict => 't', prorettype => 'anyarray',
+ proargtypes => 'anyarray', prosrc => 'array_shuffle' },
+{ oid => '8465', descr => 'take samples from array',
+ proname => 'array_sample', proisstrict => 't', prorettype => 'anyarray',
+ proargtypes => 'anyarray int4', prosrc => 'array_sample' },
{ oid => '764', descr => 'large object import',
proname => 'lo_import', provolatile => 'v', proparallel => 'u',
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index ce6f3a65f9..4994e6ac62 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -2445,3 +2445,33 @@ SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
ERROR: number of elements to trim must be between 0 and 3
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
ERROR: number of elements to trim must be between 0 and 3
+-- array_shuffle
+SELECT array_agg(a order by a)
+FROM
+(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a);
+ array_agg
+------------------
+ {1,2,3,4,5,NULL}
+(1 row)
+
+-- array_sample
+SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3));
+ cardinality
+-------------
+ 3
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[];
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 10);
+ array_sample
+--------------
+ {1,2,3,4,5}
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
+ERROR: second parameter must not be negative
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index f774faf856..0234aafa06 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -754,3 +754,14 @@ FROM
SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
+
+-- array_shuffle
+SELECT array_agg(a order by a)
+FROM
+(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a);
+
+-- array_sample
+SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3));
+SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[];
+SELECT array_sample('{1,2,3,4,5}'::int[], 10);
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
--
2.37.1