Am 19.07.22 um 16:20 schrieb Tom Lane:
So I withdraw my original position. These functions should just shuffle or select in the array's first dimension, preserving subarrays. Or else be lazy and reject more-than-one-D arrays; but it's probably not that hard to handle them.
Here is a patch with dimension aware array_shuffle() and array_sample().If you think array_flatten() is desirable, i can take a look at it. Maybe a second parameter would be nice to specify the target dimension:
select array_flatten(array[[[1,2],[3,4]],[[5,6],[7,8]]], 1); ------------------- {1,2,3,4,5,6,7,8} select array_flatten(array[[[1,2],[3,4]],[[5,6],[7,8]]], 2); ----------------------- {{1,2,3,4},{5,6,7,8}} Martin
From 2aa6d72ff0f4d8835ee2f09f8cdf16b7e8005e56 Mon Sep 17 00:00:00 2001 From: Martin Kalcher <martin.kalc...@aboutsource.net> Date: Sun, 17 Jul 2022 18:06:04 +0200 Subject: [PATCH] Introduce array_shuffle() and array_sample() * array_shuffle() shuffles the elements of an array. * array_sample() chooses n elements from an array by random. Shuffling of arrays can already be accomplished with SQL using unnest() and array_agg(order by random()). But that is very slow compared to the new functions. In addition the new functions are dimension aware. --- doc/src/sgml/func.sgml | 35 +++++ src/backend/utils/adt/arrayfuncs.c | 189 +++++++++++++++++++++++++++ src/include/catalog/pg_proc.dat | 6 + src/test/regress/expected/arrays.out | 60 +++++++++ src/test/regress/sql/arrays.sql | 17 +++ 5 files changed, 307 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index b6783b7ad0..6b96897244 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19395,6 +19395,41 @@ SELECT NULLIF(value, '(none)') ... </para></entry> </row> + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>array_sample</primary> + </indexterm> + <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> ) + <returnvalue>anyarray</returnvalue> + </para> + <para> + Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter>. + The order of the elements in resulting array is unspecified. + </para> + <para> + <literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal> + <returnvalue>{{5,6},{3,4}}</returnvalue> + </para></entry> + </row> + + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>array_shuffle</primary> + </indexterm> + <function>array_shuffle</function> ( <type>anyarray</type> ) + <returnvalue>anyarray</returnvalue> + </para> + <para> + Shuffles the first dimension of the array. + </para> + <para> + <literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal> + <returnvalue>{{5,6},{3,4},{1,2}}</returnvalue> + </para></entry> + </row> + <row> <entry role="func_table_entry"><para role="func_signature"> <indexterm id="function-array-to-string"> diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index fb167f226a..3de1b0db30 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -6872,3 +6872,192 @@ trim_array(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * Produce array with max n random items from given array in random order. + * + * array: array object to examine (must not be NULL) + * n: max number of items + * elmtyp, elmlen, elmbyval, elmalign: info for the datatype of the items + * + * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info + * from the system catalogs, given the elmtype. However, the caller is + * in a better position to cache this info across multiple uses, or even + * to hard-wire values if the element type is hard-wired. + */ +static ArrayType * +array_shuffle_n(ArrayType *array, int n, + Oid elmtyp, int elmlen, bool elmbyval, char elmalign) +{ + ArrayType *result; + int ndim, + *dims, + *lbs, + rdims[MAXDIM], + nelm, + nitem, + i, + j, + k; + Datum elm, + *elms, + *relms; + bool nul, + *nuls, + *rnuls; + + ndim = ARR_NDIM(array); + dims = ARR_DIMS(array); + lbs = ARR_LBOUND(array); + + if (ndim < 1 || dims[0] < 1 || n < 1) + return construct_empty_array(elmtyp); + + deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign, + &relms, &rnuls, &nelm); + + /* Calculate number of elements per item. */ + nelm = (ndim > 1) ? ArrayGetNItems(ndim - 1, dims + 1) : 1; + elms = relms; + nuls = rnuls; + nitem = dims[0]; + n = Min(n, nitem); + + /* + * Shuffle array using Fisher-Yates algorithm. Swap head with an randomly + * chosen item and increment head. + */ + for (i = 0; i < n; i++) + { + k = (rand() % (nitem - i)) * nelm; + + /* Swap all elements in head with elements in item k. */ + for (j = 0; j < nelm; j++) + { + elm = *elms; + nul = *nuls; + *elms = elms[k]; + *nuls = nuls[k]; + elms[k] = elm; + nuls[k] = nul; + elms++; + nuls++; + } + } + + memcpy(rdims, dims, ndim * sizeof(int)); + rdims[0] = n; + + result = construct_md_array(relms, rnuls, ndim, rdims, lbs, + elmtyp, elmlen, elmbyval, elmalign); + + pfree(relms); + pfree(rnuls); + + return result; +} + +/* + * Shuffle the elements of an array. + */ +Datum +array_shuffle(PG_FUNCTION_ARGS) +{ + ArrayType *array, + *result; + int16 elmlen; + bool elmbyval; + char elmalign; + Oid elmtyp; + TypeCacheEntry *typentry; + int n; + + array = PG_GETARG_ARRAYTYPE_P(0); + + /* Return empty array immediately. */ + if (ARR_NDIM(array) < 1) + PG_RETURN_ARRAYTYPE_P(array); + + n = ARR_DIMS(array)[0]; + + /* There is no point in shuffling arrays with less then two items. */ + if (n < 2) + PG_RETURN_ARRAYTYPE_P(array); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + elmlen = typentry->typlen; + elmbyval = typentry->typbyval; + elmalign = typentry->typalign; + + result = array_shuffle_n(array, n, + elmtyp, elmlen, elmbyval, elmalign); + + PG_FREE_IF_COPY(array, 0); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * Choose N random elements from an array. + */ +Datum +array_sample(PG_FUNCTION_ARGS) +{ + ArrayType *array, + *result; + int16 elmlen; + bool elmbyval; + char elmalign; + Oid elmtyp; + TypeCacheEntry *typentry; + int n; + + array = PG_GETARG_ARRAYTYPE_P(0); + n = PG_GETARG_INT32(1); + + if (n < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("second parameter must not be negative"))); + + elmtyp = ARR_ELEMTYPE(array); + + /* Return an empty array if the requested sample size is zero. */ + if (n == 0) + { + PG_FREE_IF_COPY(array, 0); + PG_RETURN_ARRAYTYPE_P(construct_empty_array(elmtyp)); + } + + /* + * Return the original array if its size is less than or equal to the + * requested sample size. + */ + if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] <= n) + PG_RETURN_ARRAYTYPE_P(array); + + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + elmlen = typentry->typlen; + elmbyval = typentry->typbyval; + elmalign = typentry->typalign; + + result = array_shuffle_n(array, n, + elmtyp, elmlen, elmbyval, elmalign); + + PG_FREE_IF_COPY(array, 0); + + PG_RETURN_ARRAYTYPE_P(result); +} diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 2e41f4d9e8..f51af5ee9f 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1681,6 +1681,12 @@ proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8', proargtypes => 'internal oid internal int2 internal', prosrc => 'arraycontjoinsel' }, +{ oid => '8464', descr => 'shuffle array', + proname => 'array_shuffle', proisstrict => 't', prorettype => 'anyarray', + proargtypes => 'anyarray', prosrc => 'array_shuffle' }, +{ oid => '8465', descr => 'take samples from array', + proname => 'array_sample', proisstrict => 't', prorettype => 'anyarray', + proargtypes => 'anyarray int4', prosrc => 'array_sample' }, { oid => '764', descr => 'large object import', proname => 'lo_import', provolatile => 'v', proparallel => 'u', diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index ce6f3a65f9..1eb7ef37cf 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -2445,3 +2445,63 @@ SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail ERROR: number of elements to trim must be between 0 and 3 SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail ERROR: number of elements to trim must be between 0 and 3 +-- array_shuffle +SELECT array_agg(a order by a) +FROM +(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a); + array_agg +------------------ + {1,2,3,4,5,NULL} +(1 row) + +SELECT array_dims(array_shuffle('{{1,2,3},{4,5,6}}'::int[][])); + array_dims +------------ + [1:2][1:3] +(1 row) + +SELECT array_dims(array_shuffle('[-1:1]={1,2,3}'::int[])); + array_dims +------------ + [-1:1] +(1 row) + +SELECT (array_shuffle('{{1,2},{3,4},{5,6}}'::int[][]))[1:][1] <@ '{1,3,5}'::int[]; + ?column? +---------- + t +(1 row) + +-- array_sample +SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3)); + cardinality +------------- + 3 +(1 row) + +SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[]; + ?column? +---------- + t +(1 row) + +SELECT array_sample('{1,2,3,4,5}'::int[], 10) <@ '{1,2,3,4,5}'::int[]; + ?column? +---------- + t +(1 row) + +SELECT array_dims(array_sample('{{1,2},{3,4},{4,5},{6,7}}'::int[][], 3)); + array_dims +------------ + [1:3][1:2] +(1 row) + +SELECT array_dims(array_sample('[-1:1]={1,2,3}'::int[], 2)); + array_dims +------------ + [-1:0] +(1 row) + +SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail +ERROR: second parameter must not be negative diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index f774faf856..5daee9aa72 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -754,3 +754,20 @@ FROM SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail + +-- array_shuffle +SELECT array_agg(a order by a) +FROM +(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a); + +SELECT array_dims(array_shuffle('{{1,2,3},{4,5,6}}'::int[][])); +SELECT array_dims(array_shuffle('[-1:1]={1,2,3}'::int[])); +SELECT (array_shuffle('{{1,2},{3,4},{5,6}}'::int[][]))[1:][1] <@ '{1,3,5}'::int[]; + +-- array_sample +SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3)); +SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[]; +SELECT array_sample('{1,2,3,4,5}'::int[], 10) <@ '{1,2,3,4,5}'::int[]; +SELECT array_dims(array_sample('{{1,2},{3,4},{4,5},{6,7}}'::int[][], 3)); +SELECT array_dims(array_sample('[-1:1]={1,2,3}'::int[], 2)); +SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail -- 2.37.1