Am 22.07.22 um 11:31 schrieb Martin Kalcher:
i came to the same conclusions and went with Option 1 (see patch).
Mainly because most code in utils/adt is organized by type and this way
it is clear, that this is a thin wrapper around pg_prng.
Small patch update. I realized the new functions should live
array_userfuncs.c (rather than arrayfuncs.c), fixed some file headers
and added some comments to the code.From 138777531961c31250074d1c0d87417e31d63656 Mon Sep 17 00:00:00 2001
From: Martin Kalcher <martin.kalc...@aboutsource.net>
Date: Sun, 17 Jul 2022 18:06:04 +0200
Subject: [PATCH] Introduce array_shuffle() and array_sample()
* array_shuffle() shuffles the elements of an array.
* array_sample() chooses n elements from an array by random.
The new functions share its prng state with random() and thus interoperate
with setseed().
---
doc/src/sgml/func.sgml | 35 +++++
src/backend/utils/adt/Makefile | 1 +
src/backend/utils/adt/array_userfuncs.c | 182 ++++++++++++++++++++++++
src/backend/utils/adt/float.c | 37 +----
src/backend/utils/adt/user_prng.c | 87 +++++++++++
src/include/catalog/pg_proc.dat | 6 +
src/include/utils/user_prng.h | 19 +++
src/test/regress/expected/arrays.out | 58 ++++++++
src/test/regress/sql/arrays.sql | 14 ++
9 files changed, 406 insertions(+), 33 deletions(-)
create mode 100644 src/backend/utils/adt/user_prng.c
create mode 100644 src/include/utils/user_prng.h
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b6783b7ad0..87df8a9c81 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -19395,6 +19395,41 @@ SELECT NULLIF(value, '(none)') ...
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>array_sample</primary>
+ </indexterm>
+ <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
+ <returnvalue>anyarray</returnvalue>
+ </para>
+ <para>
+ Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter>.
+ The order of the elements in resulting array is unspecified.
+ </para>
+ <para>
+ <literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal>
+ <returnvalue>{{5,6},{1,2}}</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>array_shuffle</primary>
+ </indexterm>
+ <function>array_shuffle</function> ( <type>anyarray</type> )
+ <returnvalue>anyarray</returnvalue>
+ </para>
+ <para>
+ Shuffles the first dimension of the array.
+ </para>
+ <para>
+ <literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal>
+ <returnvalue>{{5,6},{1,2},{3,4}}</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm id="function-array-to-string">
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index 7c722ea2ce..42b65e58bb 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -109,6 +109,7 @@ OBJS = \
tsvector.o \
tsvector_op.o \
tsvector_parser.o \
+ user_prng.o \
uuid.o \
varbit.o \
varchar.o \
diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c
index ca70590d7d..e3ba14a17c 100644
--- a/src/backend/utils/adt/array_userfuncs.c
+++ b/src/backend/utils/adt/array_userfuncs.c
@@ -17,6 +17,7 @@
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
+#include "utils/user_prng.h"
#include "utils/typcache.h"
@@ -902,3 +903,184 @@ array_positions(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
}
+
+/*
+ * Produce array with max n randomly chosen items from given array in
+ * random order.
+ *
+ * array: array object to examine (must not be NULL)
+ * n: max number of items
+ * elmtyp, elmlen, elmbyval, elmalign: info for the datatype of the items
+ *
+ * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info
+ * from the system catalogs, given the elmtyp. However, the caller is
+ * in a better position to cache this info across multiple uses, or even
+ * to hard-wire values if the element type is hard-wired.
+ */
+static ArrayType *
+array_shuffle_n(ArrayType *array, int n,
+ Oid elmtyp, int elmlen, bool elmbyval, char elmalign)
+{
+ ArrayType *result;
+ int ndim,
+ *dims,
+ *lbs,
+ rdims[MAXDIM],
+ nelm,
+ nitem,
+ rnitem,
+ i,
+ j,
+ k;
+ Datum elm,
+ *elms,
+ *relms;
+ bool nul,
+ *nuls,
+ *rnuls;
+
+ ndim = ARR_NDIM(array);
+ dims = ARR_DIMS(array);
+ lbs = ARR_LBOUND(array);
+
+ if (ndim < 1 || dims[0] < 1 || n < 1)
+ return construct_empty_array(elmtyp);
+
+ deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+ &relms, &rnuls, &nelm);
+
+ nitem = dims[0]; /* total number of items */
+ rnitem = Min(n, nitem); /* target number of items */
+ nelm /= nitem; /* number of elements per item */
+
+ elms = relms;
+ nuls = rnuls;
+
+ /*
+ * Shuffle array using Fisher-Yates algorithm. Swap head with an randomly
+ * chosen item and increment head.
+ */
+ for (i = 0; i < rnitem; i++)
+ {
+ k = (int) user_prng_uint64_range(0, nitem - i - 1) * nelm;
+
+ /* Swap all elements in head with elements in item k. */
+ for (j = 0; j < nelm; j++)
+ {
+ elm = *elms;
+ nul = *nuls;
+ *elms = elms[k];
+ *nuls = nuls[k];
+ elms[k] = elm;
+ nuls[k] = nul;
+ elms++;
+ nuls++;
+ }
+ }
+
+ memcpy(rdims, dims, ndim * sizeof(int));
+ rdims[0] = rnitem;
+
+ result = construct_md_array(relms, rnuls, ndim, rdims, lbs,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ pfree(relms);
+ pfree(rnuls);
+
+ return result;
+}
+
+/*
+ * Shuffle the elements of an array.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+ ArrayType *array,
+ *result;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ int n;
+
+ array = PG_GETARG_ARRAYTYPE_P(0);
+
+ /* Return empty array immediately. */
+ if (ARR_NDIM(array) < 1)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ n = ARR_DIMS(array)[0];
+
+ /* There is no point in shuffling arrays with less then two items. */
+ if (n < 2)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ result = array_shuffle_n(array, n,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ PG_FREE_IF_COPY(array, 0);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * Choose N random elements from an array.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+ ArrayType *array,
+ *result;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ int n;
+
+ array = PG_GETARG_ARRAYTYPE_P(0);
+ n = PG_GETARG_INT32(1);
+
+ if (n < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("second parameter must not be negative")));
+
+ /*
+ * Return the original array if its size is less than or equal to the
+ * requested sample size.
+ */
+ if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] <= n)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ result = array_shuffle_n(array, n,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ PG_FREE_IF_COPY(array, 0);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c
index fc8f39a7a9..29dabbbc6a 100644
--- a/src/backend/utils/adt/float.c
+++ b/src/backend/utils/adt/float.c
@@ -21,15 +21,13 @@
#include "catalog/pg_type.h"
#include "common/int.h"
-#include "common/pg_prng.h"
#include "common/shortest_dec.h"
#include "libpq/pqformat.h"
-#include "miscadmin.h"
#include "utils/array.h"
#include "utils/float.h"
#include "utils/fmgrprotos.h"
+#include "utils/user_prng.h"
#include "utils/sortsupport.h"
-#include "utils/timestamp.h"
/*
@@ -64,10 +62,6 @@ float8 degree_c_sixty = 60.0;
float8 degree_c_one_half = 0.5;
float8 degree_c_one = 1.0;
-/* State for drandom() and setseed() */
-static bool drandom_seed_set = false;
-static pg_prng_state drandom_seed;
-
/* Local function prototypes */
static double sind_q1(double x);
static double cosd_q1(double x);
@@ -2749,30 +2743,8 @@ datanh(PG_FUNCTION_ARGS)
Datum
drandom(PG_FUNCTION_ARGS)
{
- float8 result;
-
- /* Initialize random seed, if not done yet in this process */
- if (unlikely(!drandom_seed_set))
- {
- /*
- * If possible, initialize the seed using high-quality random bits.
- * Should that fail for some reason, we fall back on a lower-quality
- * seed based on current time and PID.
- */
- if (unlikely(!pg_prng_strong_seed(&drandom_seed)))
- {
- TimestampTz now = GetCurrentTimestamp();
- uint64 iseed;
-
- /* Mix the PID with the most predictable bits of the timestamp */
- iseed = (uint64) now ^ ((uint64) MyProcPid << 32);
- pg_prng_seed(&drandom_seed, iseed);
- }
- drandom_seed_set = true;
- }
-
- /* pg_prng_double produces desired result range [0.0 - 1.0) */
- result = pg_prng_double(&drandom_seed);
+ /* user_prng_double produces desired result range [0.0 - 1.0) */
+ float8 result = user_prng_double();
PG_RETURN_FLOAT8(result);
}
@@ -2792,8 +2764,7 @@ setseed(PG_FUNCTION_ARGS)
errmsg("setseed parameter %g is out of allowed range [-1,1]",
seed)));
- pg_prng_fseed(&drandom_seed, seed);
- drandom_seed_set = true;
+ user_prng_seed(seed);
PG_RETURN_VOID();
}
diff --git a/src/backend/utils/adt/user_prng.c b/src/backend/utils/adt/user_prng.c
new file mode 100644
index 0000000000..cac9abe40c
--- /dev/null
+++ b/src/backend/utils/adt/user_prng.c
@@ -0,0 +1,87 @@
+/*-------------------------------------------------------------------------
+ *
+ * user_prng.c
+ * Random number generator for user facing functions.
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/user_prng.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "common/pg_prng.h"
+#include "miscadmin.h"
+#include "utils/user_prng.h"
+#include "utils/timestamp.h"
+
+/*
+ * Process wide state vector used by all user facing random functions,
+ * like random() or array_shuffle().
+ */
+static pg_prng_state user_prng_state;
+static bool user_prng_state_set = false;
+
+/*
+ * Ensure the prng is seeded.
+ */
+static inline void
+user_prng_ensure_seed()
+{
+ /* Initialize prng state, if not done yet in this process. */
+ if (unlikely(!user_prng_state_set))
+ {
+ /*
+ * If possible, initialize the state using high-quality random bits.
+ * Should that fail for some reason, we fall back on a lower-quality
+ * seed based on current time and PID.
+ */
+ if (unlikely(!pg_prng_strong_seed(&user_prng_state)))
+ {
+ uint64 now = (uint64) GetCurrentTimestamp();
+ uint64 pid = (uint64) MyProcPid;
+
+ /* Mix PID with the most predictable bits of the timestamp. */
+ pg_prng_seed(&user_prng_state, now ^ (pid << 32));
+ }
+
+ user_prng_state_set = true;
+ }
+}
+
+/*
+ * Seed the prng.
+ */
+inline void
+user_prng_seed(float8 seed)
+{
+ pg_prng_fseed(&user_prng_state, seed);
+
+ user_prng_state_set = true;
+}
+
+/*
+ * Select a random double from the range [0.0, 1.0).
+ */
+inline float8
+user_prng_double()
+{
+ user_prng_ensure_seed();
+
+ return pg_prng_double(&user_prng_state);
+}
+
+/*
+ * Select a random uint64 from the range [min, max].
+ */
+inline uint64
+user_prng_uint64_range(uint64 min, uint64 max)
+{
+ user_prng_ensure_seed();
+
+ return pg_prng_uint64_range(&user_prng_state, min, max);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 2e41f4d9e8..9933d10f6d 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1681,6 +1681,12 @@
proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8',
proargtypes => 'internal oid internal int2 internal',
prosrc => 'arraycontjoinsel' },
+{ oid => '8464', descr => 'shuffle array',
+ proname => 'array_shuffle', provolatile => 'v', proparallel => 'r', proisstrict => 't',
+ prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'array_shuffle' },
+{ oid => '8465', descr => 'take samples from array',
+ proname => 'array_sample', provolatile => 'v', proparallel => 'r', proisstrict => 't',
+ prorettype => 'anyarray', proargtypes => 'anyarray int4', prosrc => 'array_sample' },
{ oid => '764', descr => 'large object import',
proname => 'lo_import', provolatile => 'v', proparallel => 'u',
diff --git a/src/include/utils/user_prng.h b/src/include/utils/user_prng.h
new file mode 100644
index 0000000000..1f6df0b6fa
--- /dev/null
+++ b/src/include/utils/user_prng.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ *
+ * user_prng.h
+ * Random number generator for user facing functions.
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * src/include/utils/user_prng.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef USER_PRNG_H
+#define USER_PRNG_H
+
+extern void user_prng_seed(float8 seed);
+extern float8 user_prng_double();
+extern uint64 user_prng_uint64_range(uint64 min, uint64 max);
+
+#endif /* USER_PRNG_H */
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index ce6f3a65f9..ff4fea0923 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -2445,3 +2445,61 @@ SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
ERROR: number of elements to trim must be between 0 and 3
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
ERROR: number of elements to trim must be between 0 and 3
+-- array_shuffle
+SELECT setseed(0.1); -- ensure predictable results
+ setseed
+---------
+
+(1 row)
+
+SELECT array_shuffle('{NULL,1,2,3,4,5}'::int[]);
+ array_shuffle
+------------------
+ {5,NULL,1,2,4,3}
+(1 row)
+
+SELECT array_shuffle('{NULL,1,2,3,4,5}'::int[]);
+ array_shuffle
+------------------
+ {4,1,2,NULL,3,5}
+(1 row)
+
+SELECT array_shuffle('[-1:2][2:3]={{1,2},{3,4},{5,6},{7,8}}'::int[]);
+ array_shuffle
+---------------------------------------
+ [-1:2][2:3]={{7,8},{3,4},{1,2},{5,6}}
+(1 row)
+
+-- array_sample
+SELECT setseed(0.1); -- ensure predictable results
+ setseed
+---------
+
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 3);
+ array_sample
+--------------
+ {5,1,2}
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 3);
+ array_sample
+--------------
+ {3,4,1}
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 6);
+ array_sample
+--------------
+ {1,2,3,4,5}
+(1 row)
+
+SELECT array_sample('[-1:2][2:3]={{1,2},{3,4},{5,6},{7,8}}'::int[], 3);
+ array_sample
+---------------------------------
+ [-1:1][2:3]={{1,2},{5,6},{3,4}}
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
+ERROR: second parameter must not be negative
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index f774faf856..3371c04131 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -754,3 +754,17 @@ FROM
SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
+
+-- array_shuffle
+SELECT setseed(0.1); -- ensure predictable results
+SELECT array_shuffle('{NULL,1,2,3,4,5}'::int[]);
+SELECT array_shuffle('{NULL,1,2,3,4,5}'::int[]);
+SELECT array_shuffle('[-1:2][2:3]={{1,2},{3,4},{5,6},{7,8}}'::int[]);
+
+-- array_sample
+SELECT setseed(0.1); -- ensure predictable results
+SELECT array_sample('{1,2,3,4,5}'::int[], 3);
+SELECT array_sample('{1,2,3,4,5}'::int[], 3);
+SELECT array_sample('{1,2,3,4,5}'::int[], 6);
+SELECT array_sample('[-1:2][2:3]={{1,2},{3,4},{5,6},{7,8}}'::int[], 3);
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
--
2.37.1