Hi folks,

I'm working on a "ragged array" class -- an array that can store and work with what can be considered tabular data, with the rows of different lengths:

"""
ragged_array

A "ragged" array class -- built on numpy

The idea is to be able to store data that is essentially 2-d, but each row is
an arbitrary length, like:

1   2   3
4   5   6   7   8   9
10 11
12 13  14  15  16  17  18
19 20  21
...

At the moment, my implementation (see enclosed) stores the data in a 1-d numpy array as an attribute, and also an index array that stores the indexes into the rows. This is working fine.

However, I'd like to have it support any of the usual numpy operations that make sense for a ragged array:

arr.sum()
arr *= a_scalar
arr * a_scalar

etc, etc, etc.


So I thought maybe I'd do a subclass, instead of having the data array an attribute of the class. But I can't figure out how to solve the indexing problem:

I want to re-map indexing, so that:

arr[i] returns the ith "row":

In [2]: ra = ragged_array([(1,2), (3,4,5), (6,7)])

In [4]: print ra
ragged array:
[1 2]
[3 4 5]
[6 7]

In [5]: ra[1]
Out[5]: array([3, 4, 5])

I'm currently doing (error checking removed):


def __getitem__(self, index):
    """
    returns a numpy array of one row.

    Row ``index`` is the part of the flat data array that lies between
    two consecutive entries of the index array.  (Error checking removed.)
    """
    start = self._index_array[index]
    stop = self._index_array[index + 1]
    return self._data_array[start:stop]

But if I subclass ndarray, then self._data_array becomes just plain "self", and I've overloaded indexing (and slicing), so I don't know how I could index into the "flat" array to get the subset of the array I need.

any ideas?

Other comments about the class would be great, too.

-Chris


--
Christopher Barker, Ph.D.
Oceanographer

Emergency Response Division
NOAA/NOS/OR&R            (206) 526-6959   voice
7600 Sand Point Way NE   (206) 526-6329   fax
Seattle, WA  98115       (206) 526-6317   main reception

chris.bar...@noaa.gov
#!/usr/bin/env python

"""
ragged_array

A "ragged" array class -- built on numpy

The idea is to be able to store data that is essentially 2-d, but each row is
an arbitrary length, like:

1   2   3
4   5   6   7   8   9 
10 11  
12 13  14  15  16  17  18
19 20  21
...

This can also be extended to support higher dimensional arrays, as long as
only the first dimension is the "ragged" one.

Internally, the data is stored as an array of one less dimension, with indexes
into the array to catch the "rows". 

The array can be indexed by row.

Operations can be done on the entire array just like any numpy array
  - this is one of the primary advantages of using a single numpy array 
    for the internal storage.
  - operations that require slicing, specifying an axis, etc, are likely to
    fail
"""

import numpy as np

class ragged_array(object):
    """
    A 2-d "ragged" array: a sequence of rows, each of arbitrary length.

    The rows are stored back to back in one numpy array (``_data_array``).
    ``_index_array`` holds the offset at which each row starts, plus one
    extra entry at the end equal to the total length, so that row ``i`` is
    ``_data_array[_index_array[i]:_index_array[i+1]]``.
    """

    def __init__(self, data, dtype=None):
        """
        create a new ragged array

        data should be a sequence of sequences:
        [ [1, 2, 3], [4,5,6,7], [4,2] ]

        if no dtype is provided, numpy determines it from all of the values
        (so mixing ints and floats gives a float array)

        """
        # flatten the data sequence, remembering where each row starts:
        flat = []
        offsets = [0]
        for row in data:
            flat.extend(row)
            offsets.append(offsets[-1] + len(row))
        self._data_array = np.array(flat, dtype=dtype)
        # note: np.intp is the integer type numpy uses for indexing
        #       (pointer sized: 32 bits on 32 bit python, 64 bits on 64 bit
        #       python).  The old "np.int" alias no longer exists in
        #       current numpy releases.
        self._index_array = np.array(offsets, dtype=np.intp)

    @classmethod
    def _from_arrays(cls, data_array, index_array):
        """
        build a ragged array directly from its two internal arrays

        Nothing is copied, so the result shares ``data_array`` with whoever
        passed it in.
        """
        new = cls.__new__(cls)
        new._data_array = data_array
        new._index_array = index_array
        return new

    def append(self, row):
        """
        add a row to the end of the array

        This requires a copy of the data array, so after appending, this
        array no longer shares data with an array it was sliced from.

        """
        # An empty row adds no data.  Skipping the concatenation also avoids
        # numpy turning the empty sequence into an empty float array and
        # upcasting integer data to float.
        if np.size(row) > 0:
            self._data_array = np.r_[self._data_array, row]
        self._index_array = np.r_[self._index_array, (self._data_array.shape[0],)]

    def __len__(self):
        # there is an extra index at the end, so that _index_array[i+1] works
        return len(self._index_array) - 1

    def __getitem__(self, index):
        """
        ra[i] returns a numpy array of one row (a view onto the data).
        ra[i:j] returns a new ragged_array holding rows i to j-1.

        Negative indexes count from the end.  An integer index outside the
        array raises IndexError; a slice with a step other than 1 raises
        ValueError.
        """
        if isinstance(index, slice):
            return self._get_rows(index)
        nrows = len(self)
        if index < 0:
            index = index + nrows
        if not 0 <= index < nrows:
            raise IndexError("ragged_array index out of range")
        start = self._index_array[index]
        stop = self._index_array[index + 1]
        return self._data_array[start:stop]

    def _get_rows(self, rows):
        """
        return the rows selected by the slice object ``rows`` as a new
        ragged_array whose data is a view onto this array's data
        """
        start, stop, step = rows.indices(len(self))
        if step != 1:
            raise ValueError("ragged_array slices do not support a step")
        # a[3:1] and friends select nothing
        stop = max(stop, start)
        # offsets of the selected rows, including the closing offset; this
        # is never empty because _index_array has len(self)+1 entries
        offsets = self._index_array[start:stop + 1]
        data = self._data_array[offsets[0]:offsets[-1]]
        # re-base the offsets so the first selected row starts at 0
        return self._from_arrays(data, offsets - offsets[0])

    def __getslice__(self, i, j):
        """
        ra.__getslice__(i, j) <==> a[i:j]

        Only Python 2 calls this (Python 3 passes a slice object to
        __getitem__ instead).  Python 2 has already added len(self) to
        negative indexes before calling, so anything still negative lies
        before the start of the array and is clamped to 0.

        This a view, just like numpy arrays
        """
        return self.__getitem__(slice(max(i, 0), max(j, 0)))

    def __str__(self):
        """
        present a nice string representation of the array
        """
        msg = ['ragged array:']
        msg.extend(str(self[i]) for i in range(len(self)))
        # the empty last entry gives the string a trailing newline
        msg.append("")
        return "\n".join(msg)

    def flat(self):
        """
        returns a flattened version of the array -- 1-d

        actually returns the internal array representation, so it shares
        its data with the ragged array

        """
        return self._data_array
            

#!/usr/bin/env python

"""
test_ragged_array.py

tests of the ragged_array class

designed to be run with nose
"""

import nose
import nose.tools
import numpy as np


from numpy_extras.ragged_array import ragged_array as ra


def test_one_row():
    "a ragged array built from a single row gives that row back"
    only_row = (1, 2, 3)
    arr = ra([only_row])
    assert np.array_equal(arr[0], only_row)
    
def test_three_rows():
    "each of three rows of different lengths can be read back by index"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
    )
    arr = ra(rows)
    for i, expected in enumerate(rows):
        assert np.array_equal(arr[i], expected)

def test_slice():
    "slicing rows out of the middle of the array"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
        [11, 12, 13, 14, 15],
    )
    arr = ra(rows)
    middle = arr[1:3]
    print(middle)
    # the slice is re-based: its row 0 is row 1 of the original
    assert np.array_equal(middle[0], rows[1])
    assert np.array_equal(middle[1], rows[2])

def test_slice2():
    "slicing from the very first row"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
        [11, 12, 13, 14, 15],
    )
    arr = ra(rows)
    head = arr[0:3]
    print(head)
    assert np.array_equal(head[0], rows[0])
    assert np.array_equal(head[1], rows[1])
    assert np.array_equal(head[2], rows[2])

def test_slice3():
    "slicing up to and including the last row"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
        [11, 12, 13, 14, 15],
    )
    arr = ra(rows)
    tail = arr[2:4]
    print(tail)
    assert np.array_equal(tail[0], rows[2])
    assert np.array_equal(tail[1], rows[3])

def test_slice4():
    """
    a slice going over the end

    The stop index is past the last row, so the slice is expected to hold
    just the rows that exist.  (This test used to be a second
    ``test_slice3``, which replaced the earlier test of that name so that
    the earlier one never ran.)
    """
    rows = ([1,2,3],
            [4,5,6,7,8],
            [9, 10],
            [11, 12, 13, 14, 15] )
    a = ra(rows)
    over_end = a[2:6]
    assert len(over_end) == 2
    assert np.array_equal(over_end[0], rows[2])
    assert np.array_equal(over_end[1], rows[3])

def test_iteration():
    "iterating over a ragged array yields its rows in order"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
        [11, 12, 13, 14, 15],
    )
    ragged = ra(rows)
    for expected, got in zip(rows, ragged):
        assert np.array_equal(expected, got)

def test_negative_index():
    "negative indexes count rows from the end"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
        [11, 12, 13, 14, 15],
    )
    ragged = ra(rows)
    assert ragged[-2][0] == 9
    for i in (-1, -2, -3, -4):
        assert np.array_equal(ragged[i], rows[i])

@nose.tools.raises(IndexError)
def test_large_negative_index():
    "a negative index reaching before the first row raises IndexError"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
        [11, 12, 13, 14, 15],
    )
    ragged = ra(rows)
    # four rows, so -5 is one step too far back
    print(ragged[-5])

    
def test_dtype():
    "a dtype passed to the constructor is the dtype of the rows"
    values = (1, 2, 3)
    arr = ra([values], dtype=np.float32)
    row_dtype = arr[0].dtype
    print(row_dtype)
    assert row_dtype == np.float32

def test_mixed_types():
    "int and float rows can be mixed when a float dtype is given"
    rows = (
        [1, 2, 3],
        [4.1, 5.2, 6.3, 7.4, 8.5],
    )
    arr = ra(rows, dtype=np.float64)
    print(arr)
    for i in (0, 1):
        assert np.array_equal(arr[i], rows[i])
    
def test_mixed_types2():
    "with no dtype given, mixing int and float rows gives float64 rows"
    rows = (
        [1, 2, 3],
        [4.1, 5.2, 6.3, 7.4, 8.5],
    )
    arr = ra(rows)
    print(arr)
    # even the all-integer row comes back as float64
    first_row = arr[0]
    assert first_row.dtype == np.float64
    
        
def test_string():
    "str() gives a header line followed by one line per row"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
    )
    expected = "ragged array:\n[1 2 3]\n[4 5 6 7 8]\n[ 9 10]\n"
    arr = ra(rows)
    assert str(arr) == expected

def test_flat():
    "flat() gives all of the values as one 1-d array, in row order"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
    )
    expected = np.arange(1, 11)
    arr = ra(rows)
    assert np.array_equal(arr.flat(), expected)

def test_math_flat():
    "in-place math on the flat array shows up in the rows"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
    )
    arr = ra(rows)
    values = arr.flat()
    values *= 5
    # 4 * 5 and 10 * 5
    assert arr[1][0] == 20
    assert arr[2][1] == 50
    
def test_append():
    "append() adds a row at the end of the array"
    rows = (
        [1, 2, 3],
        [4, 5, 6, 7, 8],
        [9, 10],
    )
    extra = (11, 12, 13, 14)
    arr = ra(rows)
    arr.append(extra)

    print(arr)
    assert np.array_equal(arr[3], extra)
    assert len(arr) == 4
    assert arr[3][2] == 13
    assert np.array_equal(arr[-1], extra)
                           
                           
                           
                           
                           
                           
                           
_______________________________________________
NumPy-Discussion mailing list
NumPy-Discussion@scipy.org
http://mail.scipy.org/mailman/listinfo/numpy-discussion

Reply via email to