Changeset: f197b705aae5 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f197b705aae5
Modified Files:
monetdb5/extras/pyapi/Benchmarks/pyapi_test.sh
monetdb5/extras/pyapi/pyapi.c
monetdb5/extras/pyapi/pytypes.h
sql/backends/monet5/sql.c
sql/backends/monet5/sql.h
Branch: pyapi
Log Message:
Move code for Python Object -> BAT conversion into separate functions.
diffs (truncated from 903 to 300 lines):
diff --git a/monetdb5/extras/pyapi/Benchmarks/pyapi_test.sh b/monetdb5/extras/pyapi/Benchmarks/pyapi_test.sh
--- a/monetdb5/extras/pyapi/Benchmarks/pyapi_test.sh
+++ b/monetdb5/extras/pyapi/Benchmarks/pyapi_test.sh
@@ -11,7 +11,7 @@ export MSERVERTEST='netstat -ant | grep
# Testing parameters
# Input test (zero copy vs copy)
# The input sizes to test (in MB)
-export INPUT_TESTING_SIZES="0.1 1 10 100 1000 10000"
+export INPUT_TESTING_SIZES="0.1 1 10 100 1000"
# Amount of tests to run for each size
export INPUT_TESTING_NTESTS=10
diff --git a/monetdb5/extras/pyapi/pyapi.c b/monetdb5/extras/pyapi/pyapi.c
--- a/monetdb5/extras/pyapi/pyapi.c
+++ b/monetdb5/extras/pyapi/pyapi.c
@@ -749,7 +749,7 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
t_start = ceil((count * chunk) / totalchunks);
t_end = floor((count * (chunk + 1)) / totalchunks);
if (((int)count) / 2 * 2 == (int)count) t_end--;
- VERBOSE_MESSAGE("---Start: %d, End: %d, Count: %d\n", t_start, t_end, t_end - t_start);
+ VERBOSE_MESSAGE("---Start: %zu, End: %zu, Count: %zu\n", t_start, t_end, t_end - t_start);
}
}
#endif
@@ -830,111 +830,20 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
goto wrapup;
}
+ if (code_object == NULL) { PyRun_SimpleString("del pyfun"); }
+
// Now we need to do some error checking on the result object, because
the result object has to have the correct type/size
// We will also do some converting of result objects to a common type
(such as scalar -> [[scalar]])
- if (pResult) {
- PyObject * pColO = NULL;
-
- if (PyType_IsPandasDataFrame(pResult)) {
- //the result object is a Pandas data frame
- //we can convert the pandas data frame to a numpy array by
simply accessing the "values" field (as pandas dataframes are numpy arrays
internally)
- pResult = PyObject_GetAttrString(pResult, "values");
- if (pResult == NULL) {
- msg = createException(MAL, "pyapi.eval", "Invalid Pandas
data frame.");
- goto wrapup;
- }
- //we transpose the values field so it's aligned correctly for
our purposes
- pResult = PyObject_GetAttrString(pResult, "T");
- if (pResult == NULL) {
- msg = createException(MAL, "pyapi.eval", "Invalid Pandas
data frame.");
- goto wrapup;
- }
- }
-
- if (PyType_IsPyScalar(pResult)) { //check if the return object is
a scalar
- if (pci->retc == 1) {
- //if we only expect a single return value, we can accept
scalars by converting it into an array holding an array holding the element
(i.e. [[pResult]])
- PyObject *list = PyList_New(1);
- PyList_SetItem(list, 0, pResult);
- pResult = list;
-
- list = PyList_New(1);
- PyList_SetItem(list, 0, pResult);
- pResult = list;
- }
- else {
- //the result object is a scalar, yet we expect more than
one return value. We can only convert the result into a list with a single
element, so the output is necessarily wrong.
- msg = createException(MAL, "pyapi.eval", "A single scalar
was returned, yet we expect a list of %d columns. We can only convert a single
scalar into a single column, thus the result is invalid.", pci->retc);
- goto wrapup;
- }
- }
- else {
- //if it is not a scalar, we check if it is a single array
- bool IsSingleArray = TRUE;
- PyObject *data = pResult;
- if (PyType_IsNumpyMaskedArray(data)) {
- data = PyObject_GetAttrString(pResult, "data");
- if (data == NULL) {
- msg = createException(MAL, "pyapi.eval", "Invalid
masked array.");
- goto wrapup;
- }
- }
- if (PyType_IsNumpyArray(data)) {
- if (PyArray_NDIM((PyArrayObject*)data) != 1) {
- IsSingleArray = FALSE;
- }
- else {
- pColO = PyArray_GETITEM((PyArrayObject*)data,
PyArray_GETPTR1((PyArrayObject*)data, 0));
- IsSingleArray = PyType_IsPyScalar(pColO);
- }
- }
- else if (PyList_Check(data)) {
- pColO = PyList_GetItem(data, 0);
- IsSingleArray = PyType_IsPyScalar(pColO);
- }
- else if (PyLazyArray_CheckExact(data)) {
- pColO = data;
- IsSingleArray = TRUE;
- } else if (!PyType_IsNumpyMaskedArray(data)) {
- //it is neither a python array, numpy array or numpy
masked array, thus the result is unsupported! Throw an exception!
- msg = createException(MAL, "pyapi.eval", "Unsupported
result object. Expected either an array, a numpy array, a numpy masked array or
a pandas data frame, but received an object of type \"%s\"",
PyString_AsString(PyObject_Str(PyObject_Type(data))));
- goto wrapup;
- }
-
- if (IsSingleArray) {
- if (pci->retc == 1) {
- //if we only expect a single return value, we can
accept a single array by converting it into an array holding an array holding
the element (i.e. [pResult])
- PyObject *list = PyList_New(1);
- PyList_SetItem(list, 0, pResult);
- pResult = list;
- }
- else {
- //the result object is a single array, yet we expect
more than one return value. We can only convert the result into a list with a
single array, so the output is necessarily wrong.
- msg = createException(MAL, "pyapi.eval", "A single
array was returned, yet we expect a list of %d columns. The result is
invalid.", pci->retc);
- goto wrapup;
- }
- }
- else {
- //the return value is an array of arrays, all we need to
do is check if it is the correct size
- int results = 0;
- if (PyList_Check(data)) results = PyList_Size(data);
- else results = PyArray_DIMS((PyArrayObject*)data)[0];
- if (results != pci->retc) {
- //wrong return size, we expect pci->retc arrays
- msg = createException(MAL, "pyapi.eval", "An array of
size %d was returned, yet we expect a list of %d columns. The result is
invalid.", results, pci->retc);
- goto wrapup;
- }
- }
- }
- if (code_object == NULL) { PyRun_SimpleString("del pyfun"); }
- } else {
- msg = createException(MAL, "pyapi.eval", "Invalid result object.
No result object could be generated.");
+ pResult = PyObject_CheckForConversion(pResult, pci->retc, NULL, &msg);
+ if (pResult == NULL) {
goto wrapup;
}
}
VERBOSE_MESSAGE("Collecting return values.\n");
+
+
// Now we have executed the Python function, we have to collect the return
values and convert them to BATs
// We will first collect header information about the Python return
objects and extract the underlying C arrays
// We will store this header information in a PyReturn object
@@ -942,89 +851,10 @@ str PyAPIeval(MalBlkPtr mb, MalStkPtr st
// The reason we are doing this as a separate step is because this
preprocessing requires us to call the Python API
// Whereas the actual returning does not require us to call the Python API
// This means we can do the actual returning without holding the GIL
- for (i = 0; i < pci->retc; i++) {
- // Refers to the current Numpy mask (if it exists)
- PyObject *pMask = NULL;
- // Refers to the current Numpy array
- PyObject * pColO = NULL;
- // This is the PyReturn header information for the current return
value, we will fill this now
- PyReturn *ret = &pyreturn_values[i];
-
- ret->multidimensional = FALSE;
- // There are three possibilities (we have ensured this right after
executing the Python call)
- // 1: The top level result object is a PyList or Numpy Array
containing pci->retc Numpy Arrays
- // 2: The top level result object is a (pci->retc x N) dimensional
Numpy Array [Multidimensional]
- // 3: The top level result object is a (pci->retc x N) dimensional
Numpy Masked Array [Multidimensional]
- if (PyList_Check(pResult)) {
- // If it is a PyList, we simply get the i'th Numpy array from the
PyList
- pColO = PyList_GetItem(pResult, i);
- }
- else {
- // If it isn't, the result object is either a Nump Masked Array or
a Numpy Array
- PyObject *data = pResult;
- if (PyType_IsNumpyMaskedArray(data)) {
- data = PyObject_GetAttrString(pResult, "data"); // If it is a
Masked array, the data is stored in the masked_array.data attribute
- pMask = PyObject_GetAttrString(pResult, "mask");
- }
-
- // We can either have a multidimensional numpy array, or a single
dimensional numpy array
- if (PyArray_NDIM((PyArrayObject*)data) != 1) {
- // If it is a multidimensional numpy array, we have to convert
the i'th dimension to a NUMPY array object
- ret->multidimensional = TRUE;
- ret->result_type =
PyArray_DESCR((PyArrayObject*)data)->type_num;
- }
- else {
- // If it is a single dimensional Numpy array, we get the i'th
Numpy array from the Numpy Array
- pColO = PyArray_GETITEM((PyArrayObject*)data,
PyArray_GETPTR1((PyArrayObject*)data, i));
- }
- }
-
- // Now we have to do some preprocessing on the data
- if (ret->multidimensional) {
- // If it is a multidimensional Numpy array, we don't need to do
any conversion, we can just do some pointers
- ret->count = PyArray_DIMS((PyArrayObject*)pResult)[1];
- ret->numpy_array = pResult;
- ret->numpy_mask = pMask;
- ret->array_data = PyArray_DATA((PyArrayObject*)ret->numpy_array);
- if (ret->numpy_mask != NULL) ret->mask_data =
PyArray_DATA((PyArrayObject*)ret->numpy_mask);
- ret->memory_size =
PyArray_DESCR((PyArrayObject*)ret->numpy_array)->elsize;
- }
- else {
- if (PyLazyArray_CheckExact(pColO)) {
- // To handle returning of lazy arrays, we just convert them to
a Numpy array. This is slow and could be done much faster, but since this can
only happen if we directly return one of the input arguments this should be a
rare situation anyway.
- pColO = PyLazyArray_AsNumpyArray(pColO);
- if (pColO == NULL) {
- msg = PyError_CreateException("Failed to convert lazy
array to numpy array.\n", NULL);
- goto wrapup;
- }
- }
- // If it isn't we need to convert pColO to the expected Numpy
Array type
- ret->numpy_array = PyArray_FromAny(pColO, NULL, 1, 1,
NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, NULL);
- if (ret->numpy_array == NULL) {
- msg = createException(MAL, "pyapi.eval", "Could not create a
Numpy array from the return type.\n");
- goto wrapup;
- }
-
- ret->result_type =
PyArray_DESCR((PyArrayObject*)ret->numpy_array)->type_num; // We read the
result type from the resulting array
- ret->memory_size =
PyArray_DESCR((PyArrayObject*)ret->numpy_array)->elsize;
- ret->count = PyArray_DIMS((PyArrayObject*)ret->numpy_array)[0];
- ret->array_data = PyArray_DATA((PyArrayObject*)ret->numpy_array);
- // If pColO is a Masked array, we convert the mask to a NPY_BOOL
numpy array
- if (PyObject_HasAttrString(pColO, "mask")) {
- pMask = PyObject_GetAttrString(pColO, "mask");
- if (pMask != NULL) {
- ret->numpy_mask = PyArray_FromAny(pMask,
PyArray_DescrFromType(NPY_BOOL), 1, 1, NPY_ARRAY_CARRAY, NULL);
- if (ret->numpy_mask == NULL ||
PyArray_DIMS((PyArrayObject*)ret->numpy_mask)[0] != (int)ret->count)
- {
- PyErr_Clear();
- pMask = NULL;
- ret->numpy_mask = NULL;
- }
- }
- }
- if (ret->numpy_mask != NULL) ret->mask_data =
PyArray_DATA((PyArrayObject*)ret->numpy_mask);
- }
+ if (!PyObject_PreprocessObject(pResult, pyreturn_values, pci->retc, &msg))
{
+ goto wrapup;
}
+
#ifndef WIN32
/*[SHARED_MEMORY]*/
@@ -1239,184 +1069,18 @@ returnvalues:
{
PyReturn *ret = &pyreturn_values[i];
int bat_type = ATOMstorage(getColumnType(getArgType(mb,pci,i)));
- size_t index_offset = 0;
if (bat_type == TYPE_any || bat_type == TYPE_void) {
getArgType(mb,pci,i) = bat_type;
msg = createException(MAL, "pyapi.eval", "Unknown return value,
possibly projecting with no parameters.");
goto wrapup;
- }
-
- if (ret->multidimensional) index_offset = i;
- VERBOSE_MESSAGE("- Returning a Numpy Array of type %s of size %zu and
storing it in a BAT of type %s\n", PyType_Format(ret->result_type), ret->count,
BatType_Format(bat_type));
- switch (bat_type)
- {
- case TYPE_bte:
- NP_CREATE_BAT(b, bit);
- break;
- case TYPE_sht:
- NP_CREATE_BAT(b, sht);
- break;
- case TYPE_int:
- NP_CREATE_BAT(b, int);
- break;
- case TYPE_lng:
- NP_CREATE_BAT(b, lng);
- break;
- case TYPE_flt:
- NP_CREATE_BAT(b, flt);
- break;
- case TYPE_dbl:
- NP_CREATE_BAT(b, dbl);
- break;
-#ifdef HAVE_HGE
- case TYPE_hge:
- NP_CREATE_BAT(b, hge);
- break;
-#endif
- case TYPE_str:
- {
- bool *mask = NULL;
- char *data = NULL;
- char *utf8_string = NULL;
- if (ret->mask_data != NULL)
- {
- mask = (bool*)ret->mask_data;
- }
- if (ret->array_data == NULL)
- {
- msg = createException(MAL, "pyapi.eval", "No return value
stored in the structure. n");
- goto wrapup;
- }
- data = (char*) ret->array_data;
-
- if (ret->result_type != NPY_OBJECT) {
- utf8_string = GDKzalloc(256 + ret->memory_size + 1);
- utf8_string[256 + ret->memory_size] = '\0';
- }
-
- b = BATnew(TYPE_void, TYPE_str, ret->count, TRANSIENT);
- BATseqbase(b, seqbase); b->T->nil = 0; b->T->nonil = 1;
- b->tkey = 0; b->tsorted = 0; b->trevsorted = 0;
- VERBOSE_MESSAGE("- Collecting return values of type %s.\n",
PyType_Format(ret->result_type));
- switch(ret->result_type)
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list