Am 28.09.22 um 16:18 schrieb Tom Lane:
It is seeded at process start, yes. If you don't feel a need for
user control over the sequence used by these functions, then using
pg_global_prng_state is fine. (Basically the point to be made
here is that we need to keep a tight rein on what can be affected
by setseed().)
regards, tom lane
New patch: array_shuffle() and array_sample() use pg_global_prng_state now.
Martin
From b9433564f925521f5f6bcebd7cd74a3e12f4f354 Mon Sep 17 00:00:00 2001
From: Martin Kalcher <[email protected]>
Date: Sun, 17 Jul 2022 18:06:04 +0200
Subject: [PATCH v5] Introduce array_shuffle() and array_sample()
* array_shuffle() shuffles the elements of an array.
* array_sample() chooses n elements from an array by random.
---
doc/src/sgml/func.sgml | 34 +++++
src/backend/utils/adt/array_userfuncs.c | 176 ++++++++++++++++++++++++
src/include/catalog/pg_proc.dat | 6 +
src/test/regress/expected/arrays.out | 54 ++++++++
src/test/regress/sql/arrays.sql | 14 ++
5 files changed, 284 insertions(+)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e1fe4fec57..6b2a3d16f6 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -18468,6 +18468,40 @@ SELECT NULLIF(value, '(none)') ...
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>array_sample</primary>
+ </indexterm>
+ <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
+ <returnvalue>anyarray</returnvalue>
+ </para>
+ <para>
+ Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter> in selection order.
+ </para>
+ <para>
+ <literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal>
+ <returnvalue>{{5,6},{1,2}}</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>array_shuffle</primary>
+ </indexterm>
+ <function>array_shuffle</function> ( <type>anyarray</type> )
+ <returnvalue>anyarray</returnvalue>
+ </para>
+ <para>
+ Shuffles the first dimension of the array.
+ </para>
+ <para>
+ <literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal>
+ <returnvalue>{{5,6},{1,2},{3,4}}</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm id="function-array-to-string">
diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c
index ca70590d7d..4bf28da5e4 100644
--- a/src/backend/utils/adt/array_userfuncs.c
+++ b/src/backend/utils/adt/array_userfuncs.c
@@ -14,6 +14,7 @@
#include "catalog/pg_type.h"
#include "common/int.h"
+#include "common/pg_prng.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
@@ -902,3 +903,178 @@ array_positions(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
}
+
+/*
+ * Produce array with n randomly chosen items from given array in random order.
+ *
+ * array: array object to examine (must not be NULL)
+ * n: number of items (must not be greater than the size of the arrays first dimension)
+ * elmtyp, elmlen, elmbyval, elmalign: info for the datatype of the items
+ *
+ * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info
+ * from the system catalogs, given the elmtyp. However, the caller is
+ * in a better position to cache this info across multiple uses, or even
+ * to hard-wire values if the element type is hard-wired.
+ */
+static ArrayType *
+array_shuffle_n(ArrayType *array, int n,
+ Oid elmtyp, int elmlen, bool elmbyval, char elmalign)
+{
+ ArrayType *result;
+ int ndim,
+ *dims,
+ *lbs,
+ rdims[MAXDIM],
+ nelm,
+ nitem,
+ i,
+ j,
+ k;
+ Datum elm,
+ *elms,
+ *ielms,
+ *jelms;
+ bool nul,
+ *nuls,
+ *inuls,
+ *jnuls;
+
+ ndim = ARR_NDIM(array);
+ dims = ARR_DIMS(array);
+ lbs = ARR_LBOUND(array);
+
+ if (ndim < 1 || dims[0] < 1 || n < 1)
+ return construct_empty_array(elmtyp);
+
+ deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+ &elms, &nuls, &nelm);
+
+ nitem = dims[0]; /* total number of items */
+ nelm /= nitem; /* number of elements per item */
+
+ ielms = elms;
+ inuls = nuls;
+
+ /*
+ * Shuffle array using Fisher-Yates algorithm. Iterate array and swap head
+ * (ielms) with an randomly chosen item (jelms) at each iteration.
+ */
+ for (i = 0; i < n; i++)
+ {
+ j = (int) pg_prng_uint64_range(&pg_global_prng_state, 0, nitem - i - 1) * nelm;
+ jelms = ielms + j;
+ jnuls = inuls + j;
+
+ /* Swap all elements in item (i) with elements in item (j). */
+ for (k = 0; k < nelm; k++)
+ {
+ elm = *ielms;
+ nul = *inuls;
+ *ielms++ = *jelms;
+ *inuls++ = *jnuls;
+ *jelms++ = elm;
+ *jnuls++ = nul;
+ }
+ }
+
+ memcpy(rdims, dims, ndim * sizeof(int));
+ rdims[0] = n;
+
+ result = construct_md_array(elms, nuls, ndim, rdims, lbs,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ pfree(elms);
+ pfree(nuls);
+
+ return result;
+}
+
+/*
+ * Shuffle the elements of an array.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+ ArrayType *array,
+ *result;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ int n;
+
+ array = PG_GETARG_ARRAYTYPE_P(0);
+
+ /* Return empty array immediately. */
+ if (ARR_NDIM(array) < 1)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ n = ARR_DIMS(array)[0];
+
+ /* There is no point in shuffling arrays with less then two items. */
+ if (n < 2)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ result = array_shuffle_n(array, n,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ PG_FREE_IF_COPY(array, 0);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * Choose N random elements from an array.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+ ArrayType *array,
+ *result;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ int n,
+ nitem;
+
+ array = PG_GETARG_ARRAYTYPE_P(0);
+ n = PG_GETARG_INT32(1);
+ nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0];
+
+ if (n < 0 || n > nitem)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("sample size must be between 0 and %d", nitem)));
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ result = array_shuffle_n(array, n,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ PG_FREE_IF_COPY(array, 0);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index a07e737a33..00c32c58e7 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1681,6 +1681,12 @@
proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8',
proargtypes => 'internal oid internal int2 internal',
prosrc => 'arraycontjoinsel' },
+{ oid => '8464', descr => 'shuffle array',
+ proname => 'array_shuffle', provolatile => 'v', proparallel => 'r', proisstrict => 't',
+ prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'array_shuffle' },
+{ oid => '8465', descr => 'take samples from array',
+ proname => 'array_sample', provolatile => 'v', proparallel => 'r', proisstrict => 't',
+ prorettype => 'anyarray', proargtypes => 'anyarray int4', prosrc => 'array_sample' },
{ oid => '764', descr => 'large object import',
proname => 'lo_import', provolatile => 'v', proparallel => 'u',
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index 97920f38c2..9dd365f22d 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -2447,3 +2447,57 @@ SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
ERROR: number of elements to trim must be between 0 and 3
SELECT trim_array(ARRAY[]::int[], 1); -- fail
ERROR: number of elements to trim must be between 0 and 0
+-- array_shuffle
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
+ array_dims
+-------------
+ [-1:2][2:3]
+(1 row)
+
+SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
+ array_dims
+-----------------
+ [1:3][1:2][1:2]
+(1 row)
+
+-- array_sample
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
+ array_length
+--------------
+ 3
+(1 row)
+
+SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
+ array_dims
+-------------
+ [-1:1][2:3]
+(1 row)
+
+SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
+ array_dims
+-----------------
+ [1:2][1:2][1:2]
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
+ERROR: sample size must be between 0 and 6
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
+ERROR: sample size must be between 0 and 6
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index 791af5c0ce..766798bc62 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -755,3 +755,17 @@ FROM
SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
SELECT trim_array(ARRAY[]::int[], 1); -- fail
+
+-- array_shuffle
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}';
+SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}';
+SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[]));
+SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[]));
+
+-- array_sample
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}';
+SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1);
+SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3));
+SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2));
+SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail
+SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail
--
2.37.3