While playing a little more with bincount(), one modification would be handy: Allow negative integers in the bin list, but skip them when counting bins
My specific use case is calculating subtotals on columns of large datasets (1m rows x 30 cols), where some rows need to be excluded. The groupings are expensive to compute, and sometimes will involve ~99% of the rows (eliminate only outliers/errors), and other times only ~5% of the rows (focus in on a subset). I'd like to calculate subtotals like this using bincount(), without having to copy the large datasets just to eliminate the unwanted rows: # Assign each row to a group numbered from 0..G, except for -1 for rows to exclude row_groups = expensive_function(data) # Count number in each group, excluding those with grp==-1 grp_counts = bincount(list=row_groups) # Use bincount() to form subtotals by column, excluding those with grp==-1 subtotals = column_stack([ bincount(list=row_groups, weights=data[:,i]) for i in range(G+1) ]) Is there any appetite to make such a change to bincount()? This would require two simple changes to bincount() in _compiled_base.c and an update to the docstring. Here is the diff file with enough context to show the entire arr_bincount() function: *** orig_compiled_base.c 2006-07-21 13:14:21.250000000 +1000 --- _compiled_base.c 2006-07-21 13:34:41.718750000 +1000 *************** *** 70,143 **** intp j ; for ( j = 1 ; j < len; j ++ ) if ( i [j] < min ) {min = i [j] ; mn = j ;} return mn; } static PyObject * arr_bincount(PyObject *self, PyObject *args, PyObject *kwds) { /* histogram accepts one or two arguments. The first is an array ! * of non-negative integers and the second, if present, is an * array of weights, which must be promotable to double. * Call these arguments list and weight. Both must be one- * dimensional. len (weight) == len(list) * If weight is not present: ! * histogram (list) [i] is the number of occurrences of i in list. * If weight is present: * histogram (list, weight) [i] is the sum of all weight [j] ! * where list [j] == i. */ /* self is not used */ PyArray_Descr *type; PyObject *list = NULL, *weight=Py_None ; PyObject *lst=NULL, *ans=NULL, *wts=NULL; ! intp *numbers, *ians, len , mxi, mni, ans_size; int i; double *weights , *dans; static char *kwlist[] = {"list", "weights", NULL}; Py_Try(PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, &list, &weight)); Py_Try(lst = PyArray_ContiguousFromAny(list, PyArray_INTP, 1, 1)); len = PyArray_SIZE(lst); numbers = (intp *) PyArray_DATA(lst); mxi = mxx (numbers, len) ; - mni = mnx (numbers, len) ; - Py_Assert(numbers[mni] >= 0, - "irst argument of bincount must be non-negative"); ans_size = numbers [mxi] + 1 ; type = PyArray_DescrFromType(PyArray_INTP); if (weight == Py_None) { Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0)); ians = (intp *)(PyArray_DATA(ans)); for (i = 0 ; i < len ; i++) ! ians [numbers [i]] += 1 ; Py_DECREF(lst); } else { Py_Try(wts = PyArray_ContiguousFromAny(weight, PyArray_DOUBLE, 1, 1)); weights = (double *)PyArray_DATA (wts); Py_Assert(PyArray_SIZE(wts) == len, "bincount: length of weights " \ "does not match that of list"); type = PyArray_DescrFromType(PyArray_DOUBLE); Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0)); dans = (double *)PyArray_DATA (ans); for (i = 0 ; i < len ; i++) { ! dans[numbers[i]] += weights[i]; } Py_DECREF(lst); Py_DECREF(wts); } return ans; fail: Py_XDECREF(lst); Py_XDECREF(wts); Py_XDECREF(ans); return NULL; } --- 70,145 ---- intp j ; for ( j = 1 ; j < len; j ++ ) if ( i [j] < min ) {min = i [j] ; mn = j ;} return mn; } static PyObject * arr_bincount(PyObject *self, PyObject *args, PyObject *kwds) { /* histogram accepts one or two arguments. The first is an array ! * of integers and the second, if present, is an * array of weights, which must be promotable to double. * Call these arguments list and weight. Both must be one- * dimensional. len (weight) == len(list) * If weight is not present: ! * histogram (list) [i] is the number of occurrences of i in list ! * for i>=0. Negative i values are ignored. * If weight is present: * histogram (list, weight) [i] is the sum of all weight [j] ! * where list [j] == i and i>=0. */ /* self is not used */ PyArray_Descr *type; PyObject *list = NULL, *weight=Py_None ; PyObject *lst=NULL, *ans=NULL, *wts=NULL; ! intp *numbers, *ians, len , mxi, ans_size; int i; double *weights , *dans; static char *kwlist[] = {"list", "weights", NULL}; Py_Try(PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, &list, &weight)); Py_Try(lst = PyArray_ContiguousFromAny(list, PyArray_INTP, 1, 1)); len = PyArray_SIZE(lst); numbers = (intp *) PyArray_DATA(lst); mxi = mxx (numbers, len) ; ans_size = numbers [mxi] + 1 ; type = PyArray_DescrFromType(PyArray_INTP); if (weight == Py_None) { Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0)); ians = (intp *)(PyArray_DATA(ans)); for (i = 0 ; i < len ; i++) ! if (numbers[i]>=0) { ! ians[numbers [i]] += 1 ; ! } Py_DECREF(lst); } else { Py_Try(wts = PyArray_ContiguousFromAny(weight, PyArray_DOUBLE, 1, 1)); weights = (double *)PyArray_DATA (wts); Py_Assert(PyArray_SIZE(wts) == len, "bincount: length of weights " \ "does not match that of list"); type = PyArray_DescrFromType(PyArray_DOUBLE); Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0)); dans = (double *)PyArray_DATA (ans); for (i = 0 ; i < len ; i++) { ! if (numbers[i]>=0) { ! dans[numbers[i]] += weights[i]; ! } } Py_DECREF(lst); Py_DECREF(wts); } return ans; fail: Py_XDECREF(lst); Py_XDECREF(wts); Py_XDECREF(ans); return NULL; } Cheers Stephen ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Numpy-discussion mailing list Numpy-discussion@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/numpy-discussion