From f45ac553c262c33e7f3be4d8c1616fff55e2bcbe Mon Sep 17 00:00:00 2001
From: Alexey Grishchenko <agrishchenko@pivotal.io>
Date: Wed, 3 Aug 2016 12:33:14 +0100
Subject: [PATCH] PL/Python adding support for multi-dimensional arrays

This patch adds support for multi-dimensional arrays as both input
and output parameters of PL/Python functions. The number of dimensions
is limited by the Postgres MAXDIM macro, which is 6 by default.
Both input and output multi-dimensional arrays must have fixed
dimension sizes, i.e. a 2-d array must represent an MxN matrix, a 3-d
array an MxNxK cube, etc.
The patch does not support multi-dimensional arrays of composite types:
composite values in Python might be represented as iterable objects, so
there is no way to find out where the array nesting ends and the
composite value starts. Composite types can therefore be used only in
one-dimensional arrays.
---
 src/pl/plpython/expected/plpython_types.out   |  30 +++-
 src/pl/plpython/expected/plpython_types_3.out |  30 +++-
 src/pl/plpython/plpy_typeio.c                 | 243 +++++++++++++++++++++-----
 src/pl/plpython/sql/plpython_types.sql        |   7 +
 4 files changed, 261 insertions(+), 49 deletions(-)

diff --git a/src/pl/plpython/expected/plpython_types.out b/src/pl/plpython/expected/plpython_types.out
index f0b6abd..1a06620 100644
--- a/src/pl/plpython/expected/plpython_types.out
+++ b/src/pl/plpython/expected/plpython_types.out
@@ -537,9 +537,19 @@ INFO:  (None, <type 'NoneType'>)
 (1 row)
 
 SELECT * FROM test_type_conversion_array_int4(ARRAY[[1,2,3],[4,5,6]]);
-ERROR:  cannot convert multidimensional array to Python list
-DETAIL:  PL/Python only supports one-dimensional arrays.
-CONTEXT:  PL/Python function "test_type_conversion_array_int4"
+INFO:  ([[1, 2, 3], [4, 5, 6]], <type 'list'>)
+ test_type_conversion_array_int4 
+---------------------------------
+ {{1,2,3},{4,5,6}}
+(1 row)
+
+SELECT * FROM test_type_conversion_array_int4(ARRAY[[[1,2,NULL],[NULL,5,6]],[[NULL,8,9],[10,11,12]]]);
+INFO:  ([[[1, 2, None], [None, 5, 6]], [[None, 8, 9], [10, 11, 12]]], <type 'list'>)
+          test_type_conversion_array_int4          
+---------------------------------------------------
+ {{{1,2,NULL},{NULL,5,6}},{{NULL,8,9},{10,11,12}}}
+(1 row)
+
 CREATE FUNCTION test_type_conversion_array_text(x text[]) RETURNS text[] AS $$
 plpy.info(x, type(x))
 return x
@@ -551,6 +561,13 @@ INFO:  (['foo', 'bar'], <type 'list'>)
  {foo,bar}
 (1 row)
 
+SELECT * FROM test_type_conversion_array_text(ARRAY[['foo', 'bar'],['foo2', 'bar2']]);
+INFO:  ([['foo', 'bar'], ['foo2', 'bar2']], <type 'list'>)
+ test_type_conversion_array_text 
+---------------------------------
+ {{foo,bar},{foo2,bar2}}
+(1 row)
+
 CREATE FUNCTION test_type_conversion_array_bytea(x bytea[]) RETURNS bytea[] AS $$
 plpy.info(x, type(x))
 return x
@@ -578,6 +595,13 @@ SELECT * FROM test_type_conversion_array_mixed2();
 ERROR:  invalid input syntax for integer: "abc"
 CONTEXT:  while creating return value
 PL/Python function "test_type_conversion_array_mixed2"
+CREATE FUNCTION test_type_conversion_mdarray_malformed() RETURNS int[] AS $$
+return [[1,2,3],[4,5]]
+$$ LANGUAGE plpythonu;
+SELECT * FROM test_type_conversion_mdarray_malformed();
+ERROR:  Multidimensional arrays must have array expressions with matching dimensions. PL/Python function return value has sequence length 2 while expected 3
+CONTEXT:  while creating return value
+PL/Python function "test_type_conversion_mdarray_malformed"
 CREATE FUNCTION test_type_conversion_array_record() RETURNS type_record[] AS $$
 return [{'first': 'one', 'second': 42}, {'first': 'two', 'second': 11}]
 $$ LANGUAGE plpythonu;
diff --git a/src/pl/plpython/expected/plpython_types_3.out b/src/pl/plpython/expected/plpython_types_3.out
index 56b78e1..3ef2862 100644
--- a/src/pl/plpython/expected/plpython_types_3.out
+++ b/src/pl/plpython/expected/plpython_types_3.out
@@ -537,9 +537,19 @@ INFO:  (None, <class 'NoneType'>)
 (1 row)
 
 SELECT * FROM test_type_conversion_array_int4(ARRAY[[1,2,3],[4,5,6]]);
-ERROR:  cannot convert multidimensional array to Python list
-DETAIL:  PL/Python only supports one-dimensional arrays.
-CONTEXT:  PL/Python function "test_type_conversion_array_int4"
+INFO:  ([[1, 2, 3], [4, 5, 6]], <class 'list'>)
+ test_type_conversion_array_int4 
+---------------------------------
+ {{1,2,3},{4,5,6}}
+(1 row)
+
+SELECT * FROM test_type_conversion_array_int4(ARRAY[[[1,2,NULL],[NULL,5,6]],[[NULL,8,9],[10,11,12]]]);
+INFO:  ([[[1, 2, None], [None, 5, 6]], [[None, 8, 9], [10, 11, 12]]], <class 'list'>)
+          test_type_conversion_array_int4          
+---------------------------------------------------
+ {{{1,2,NULL},{NULL,5,6}},{{NULL,8,9},{10,11,12}}}
+(1 row)
+
 CREATE FUNCTION test_type_conversion_array_text(x text[]) RETURNS text[] AS $$
 plpy.info(x, type(x))
 return x
@@ -551,6 +561,13 @@ INFO:  (['foo', 'bar'], <class 'list'>)
  {foo,bar}
 (1 row)
 
+SELECT * FROM test_type_conversion_array_text(ARRAY[['foo', 'bar'],['foo2', 'bar2']]);
+INFO:  ([['foo', 'bar'], ['foo2', 'bar2']], <class 'list'>)
+ test_type_conversion_array_text 
+---------------------------------
+ {{foo,bar},{foo2,bar2}}
+(1 row)
+
 CREATE FUNCTION test_type_conversion_array_bytea(x bytea[]) RETURNS bytea[] AS $$
 plpy.info(x, type(x))
 return x
@@ -578,6 +595,13 @@ SELECT * FROM test_type_conversion_array_mixed2();
 ERROR:  invalid input syntax for integer: "abc"
 CONTEXT:  while creating return value
 PL/Python function "test_type_conversion_array_mixed2"
+CREATE FUNCTION test_type_conversion_mdarray_malformed() RETURNS int[] AS $$
+return [[1,2,3],[4,5]]
+$$ LANGUAGE plpython3u;
+SELECT * FROM test_type_conversion_mdarray_malformed();
+ERROR:  Multidimensional arrays must have array expressions with matching dimensions. PL/Python function return value has sequence length 2 while expected 3
+CONTEXT:  while creating return value
+PL/Python function "test_type_conversion_mdarray_malformed"
 CREATE FUNCTION test_type_conversion_array_record() RETURNS type_record[] AS $$
 return [{'first': 'one', 'second': 42}, {'first': 'two', 'second': 11}]
 $$ LANGUAGE plpython3u;
diff --git a/src/pl/plpython/plpy_typeio.c b/src/pl/plpython/plpy_typeio.c
index 7ad7a44..3fdc60d 100644
--- a/src/pl/plpython/plpy_typeio.c
+++ b/src/pl/plpython/plpy_typeio.c
@@ -631,46 +631,111 @@ PLyList_FromArray(PLyDatumToOb *arg, Datum d)
 {
 	ArrayType  *array = DatumGetArrayTypeP(d);
 	PLyDatumToOb *elm = arg->elm;
-	PyObject   *list;
-	int			length;
-	int			lbound;
+	int			ndim;
+	int		   *dims;
+	int		   *lb;
+	char	   *dataptr;
+	bits8	   *bitmap;
+	int			bitmask;
 	int			i;
+	int			dim;
+	int			indx[MAXDIM];
+	PyObject   *lists[MAXDIM];
 
 	if (ARR_NDIM(array) == 0)
 		return PyList_New(0);
 
-	if (ARR_NDIM(array) != 1)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			  errmsg("cannot convert multidimensional array to Python list"),
-			  errdetail("PL/Python only supports one-dimensional arrays.")));
+	/* Array dimensions and lower bounds (one entry per dimension) */
+	ndim = ARR_NDIM(array);
+	dims = ARR_DIMS(array);
+	lb = ARR_LBOUND(array);
 
-	length = ARR_DIMS(array)[0];
-	lbound = ARR_LBOUND(array)[0];
-	list = PyList_New(length);
-	if (list == NULL)
-		PLy_elog(ERROR, "could not create new Python list");
+	/* Internal array representation pointers */
+	dataptr = ARR_DATA_PTR(array);
+	bitmap = ARR_NULLBITMAP(array);
+	bitmask = 1;
 
-	for (i = 0; i < length; i++)
+	/* Iterators initialization: every subscript starts at its lower bound */
+	for (i = 0; i < ndim; i++) {
+		indx[i] = lb[i];
+		lists[i] = NULL;
+	}
+	if ((lists[0] = PyList_New(dims[0])) == NULL)
+		PLy_elog(ERROR, "could not create new Python list");
+	/* We need this incref to keep the pointer valid after the array traversal
+	 * terminates, as this traversal does DECREF for all the lists in array */
+	Py_INCREF(lists[0]);
+
+	/* In this cycle we are going over array dimensions. Postgres stores all
+	 * elements of a multi-dimensional array linearly, last index varying
+	 * fastest: for a 3-dimensional array we first get [0,0,0] through [0,0,k],
+	 * then [0,1,0] through [0,1,k] up to [0,m,k], then [1,0,0] and so on.
+	 * In Python, each 1-d array is a separate list object, so a 3-d array of
+	 * [n,m,k] elements is a list of n m-element lists, each element of which
+	 * is a k-element list. We traverse from the outer dimensions to the inner
+	 * ones, creating nested Python lists on the way. indx[] holds subscripts
+	 * starting at lb[], so dimension d is exhausted at lb[d] + dims[d]. */
+	dim = 0;
+	while (dim >= 0)
 	{
-		Datum		elem;
-		bool		isnull;
-		int			offset;
-
-		offset = lbound + i;
-		elem = array_ref(array, 1, &offset, arg->typlen,
-						 elm->typlen, elm->typbyval, elm->typalign,
-						 &isnull);
-		if (isnull)
+		/* If we finished up iterating over current dimension - go one level up */
+		if (indx[dim] >= lb[dim] + dims[dim])
 		{
-			Py_INCREF(Py_None);
-			PyList_SET_ITEM(list, i, Py_None);
+			Py_DECREF(lists[dim]);
+			indx[dim] = 0;
+			dim -= 1;
+		}
+		/* If this is not the innermost dimension - create a nested list and descend */
+		else if (dim < ndim - 1)
+		{
+			if ((lists[dim+1] = PyList_New(dims[dim+1])) == NULL)
+				PLy_elog(ERROR, "could not create new Python list");
+			/* We need this INCREF as we keep array pointer on our side,
+			 * while PyList_SET_ITEM steals the reference */
+			Py_INCREF(lists[dim+1]);
+
+			PyList_SET_ITEM(lists[dim], indx[dim] - lb[dim], lists[dim+1]);
+			indx[dim] += 1;
+			dim += 1;
+			indx[dim] = lb[dim];
+		}
+		/* If we are iterating over the innermost dimension, fill the list with
+		 * values from the original Postgres array */
+		else if (dim == ndim - 1)
+		{
+			for (indx[dim] = lb[dim]; indx[dim] < lb[dim] + dims[dim]; indx[dim]++)
+			{
+				/* checking for NULL */
+				if (bitmap && (*bitmap & bitmask) == 0)
+				{
+					Py_INCREF(Py_None);
+					PyList_SET_ITEM(lists[dim], indx[dim] - lb[dim], Py_None);
+				}
+				else
+				{
+					Datum		itemvalue;
+
+					itemvalue = fetch_att(dataptr, elm->typbyval, elm->typlen);
+					PyList_SET_ITEM(lists[dim], indx[dim] - lb[dim], elm->func(elm, itemvalue));
+					dataptr = att_addlength_pointer(dataptr, elm->typlen, dataptr);
+					dataptr = (char *) att_align_nominal(dataptr, elm->typalign);
+				}
+
+				/* advance bitmap pointer if any */
+				if (bitmap)
+				{
+					bitmask <<= 1;
+					if (bitmask == 0x100 /* (1<<8) */)
+					{
+						bitmap++;
+						bitmask = 1;
+					}
+				}
+			}
 		}
-		else
-			PyList_SET_ITEM(list, i, elm->func(elm, elem));
 	}
 
-	return list;
+	return lists[0];
 }
 
 /*
@@ -866,39 +931,138 @@ static Datum
 PLySequence_ToArray(PLyObToDatum *arg, int32 typmod, PyObject *plrv)
 {
 	ArrayType  *array;
-	Datum		rv;
 	int			i;
 	Datum	   *elems;
 	bool	   *nulls;
 	int			len;
-	int			lbs;
+	int			ndim;
+	int			dims[MAXDIM];
+	int			lbs[MAXDIM];
+	int			indx[MAXDIM];
+	PyObject   *stack[MAXDIM];
+	int			dim;
+	int			idxelem;
+	Datum		rv;
 
 	Assert(plrv != Py_None);
 
 	if (!PySequence_Check(plrv))
 		PLy_elog(ERROR, "return value of function with array return type is not a Python sequence");
 
-	len = PySequence_Length(plrv);
+	ndim = 1;
+	dims[0] = PySequence_Length(plrv);
+	len = dims[0];
+	stack[0] = plrv;
+
+	/* We don't want to create multi-dimensional arrays when we have an empty sequence,
+	 * when we need to parse sequence of composite objects, and when we have strings */
+	if (len > 0 && !type_is_rowtype(get_base_element_type(arg->typoid)) &&
+			!PyString_Check(plrv) && !PyBytes_Check(plrv) && !PyUnicode_Check(plrv))
+	{
+		PyObject *pyptr = PySequence_GetItem(plrv, 0);
+
+		/* Follow the first element of every nested level, treating any sequence
+		 * except a string as one more array dimension */
+		while (pyptr != NULL && PySequence_Check(pyptr) &&
+				!(PyString_Check(pyptr) || PyBytes_Check(pyptr) || PyUnicode_Check(pyptr)))
+		{
+			/* dims[] and stack[] can hold at most MAXDIM levels */
+			if (ndim == MAXDIM)
+				PLy_elog(ERROR, "number of array dimensions exceeds the maximum allowed (%d)", MAXDIM);
+			dims[ndim] = PySequence_Length(pyptr);
+			if (dims[ndim] < 0)
+				PLy_elog(ERROR, "Cannot determine sequence length for function return value");
+			len *= dims[ndim];
+			stack[ndim] = pyptr;
+			ndim += 1;
+			if (dims[ndim - 1] == 0) {
+				pyptr = NULL;
+				break;
+			}
+			pyptr = PySequence_GetItem(pyptr, 0);
+		}
+
+		/* Pyptr points to element of n-dimensional array, we don't need its reference */
+		Py_XDECREF(pyptr);
+		/* The probed nested sequences were needed only for their lengths: release
+		 * them, the traversal below fetches every level again by itself */
+		for (i = 1; i < ndim; i++)
+			Py_DECREF(stack[i]);
+	}
+
+	/* We need this incref to keep the pointer valid after the array traversal
+	 * terminates, as this traversal does DECREF for all the lists in array, and
+	 * stack[0] corresponds to function return value */
+	Py_INCREF(stack[0]);
+
 	elems = palloc(sizeof(*elems) * len);
 	nulls = palloc(sizeof(*nulls) * len);
 
-	for (i = 0; i < len; i++)
-	{
-		PyObject   *obj = PySequence_GetItem(plrv, i);
+	for (i = 0; i < ndim; i++) {
+		indx[i] = 0;
+		lbs[i] = 1;
+	}
 
-		if (obj == Py_None)
-			nulls[i] = true;
-		else
+	/* In this cycle we are going over nested Python lists, fetching elements
+	 * from the deepest level and putting them into a linear array for Postgres
+	 * to interpret them as n-dimensional array. This is a cycle implementation
+	 * of DFS (recursive traversal of nested arrays here), keeping the stack in
+	 * "stack" variable */
+	dim = 0;
+	idxelem = 0;
+	while (dim >= 0)
+	{
+		/* If we finished up iterating over current list - go one level up */
+		if (indx[dim] == dims[dim])
+		{
+			Py_DECREF(stack[dim]);
+			indx[dim] = 0;
+			dim -= 1;
+		}
+		/* If this is not the innermost level - fetch the nested sequence and descend */
+		else if (dim < ndim - 1)
 		{
-			nulls[i] = false;
-			elems[i] = arg->elm->func(arg->elm, -1, obj);
+			stack[dim+1] = PySequence_GetItem(stack[dim], indx[dim]);
+			if (PySequence_Length(stack[dim+1]) != dims[dim+1])
+				PLy_elog(ERROR, "Multidimensional arrays must have array expressions with matching dimensions. "
+								"PL/Python function return value has sequence length %d while expected %d",
+								(int)PySequence_Length(stack[dim+1]), dims[dim+1]);
+			indx[dim] += 1;
+			dim += 1;
+		}
+		/* If we are iterating over the innermost list, fill the output array */
+		else if (dim == ndim - 1)
+		{
+			for (indx[dim] = 0; indx[dim] < dims[dim]; indx[dim]++)
+			{
+				PyObject *obj = PySequence_GetItem(stack[dim], indx[dim]);
+
+				if (obj == Py_None)
+					nulls[idxelem] = true;
+				else
+				{
+					nulls[idxelem] = false;
+
+					/*
+					 * Leaf element: hand it to the element type's converter.
+					 * NOTE(review): -1 is presumably "no typmod" - confirm.
+					 */
+					elems[idxelem] = arg->elm->func(arg->elm, -1, obj);
+				}
+				Py_XDECREF(obj);
+				idxelem += 1;
+			}
 		}
-		Py_XDECREF(obj);
 	}
 
-	lbs = 1;
-	array = construct_md_array(elems, nulls, 1, &len, &lbs,
-							   get_base_element_type(arg->typoid), arg->elm->typlen, arg->elm->typbyval, arg->elm->typalign);
+	array = construct_md_array(elems,
+							   nulls,
+							   ndim,
+							   dims,
+							   lbs,
+							   get_base_element_type(arg->typoid),
+							   arg->elm->typlen,
+							   arg->elm->typbyval,
+							   arg->elm->typalign);
 
 	/*
 	 * If the result type is a domain of array, the resulting array must be
diff --git a/src/pl/plpython/sql/plpython_types.sql b/src/pl/plpython/sql/plpython_types.sql
index 19d920d..f03a8b7 100644
--- a/src/pl/plpython/sql/plpython_types.sql
+++ b/src/pl/plpython/sql/plpython_types.sql
@@ -237,6 +237,7 @@ SELECT * FROM test_type_conversion_array_int4(ARRAY[NULL,1]);
 SELECT * FROM test_type_conversion_array_int4(ARRAY[]::integer[]);
 SELECT * FROM test_type_conversion_array_int4(NULL);
 SELECT * FROM test_type_conversion_array_int4(ARRAY[[1,2,3],[4,5,6]]);
+SELECT * FROM test_type_conversion_array_int4(ARRAY[[[1,2,NULL],[NULL,5,6]],[[NULL,8,9],[10,11,12]]]);
 
 
 CREATE FUNCTION test_type_conversion_array_text(x text[]) RETURNS text[] AS $$
@@ -245,6 +246,7 @@ return x
 $$ LANGUAGE plpythonu;
 
 SELECT * FROM test_type_conversion_array_text(ARRAY['foo', 'bar']);
+SELECT * FROM test_type_conversion_array_text(ARRAY[['foo', 'bar'],['foo2', 'bar2']]);
 
 
 CREATE FUNCTION test_type_conversion_array_bytea(x bytea[]) RETURNS bytea[] AS $$
@@ -268,6 +270,11 @@ $$ LANGUAGE plpythonu;
 
 SELECT * FROM test_type_conversion_array_mixed2();
 
+CREATE FUNCTION test_type_conversion_mdarray_malformed() RETURNS int[] AS $$
+return [[1,2,3],[4,5]]
+$$ LANGUAGE plpythonu;
+
+SELECT * FROM test_type_conversion_mdarray_malformed();
 
 CREATE FUNCTION test_type_conversion_array_record() RETURNS type_record[] AS $$
 return [{'first': 'one', 'second': 42}, {'first': 'two', 'second': 11}]
-- 
2.7.4 (Apple Git-66)

