Re: [Pytables-users] Strings in VLArrays (And other types of Array, maybe?)

Francesc Altet Mon, 08 May 2006 18:51:48 -0700

Hola Pepe,

A Dissabte 06 Maig 2006 06:21, Pepe Barbe va escriure:
> Hello,
>
> I have several lists that have string names with genes, the genes can
> range between 4 and 6 characters and the lists can range, in size,
> between 20 to 3000. For this reason I thought the simplest way was to
> store the array using VLArray objects. My code is very simple and
> looks like this:
[snipped]
>
> After reviewing the outcome of this process I've noticed that
> elements in the array display the original text and then some garbled
> text. I am thinking that it might be because I am assigning elements of
> size 4 when I've specified length=6 in the StringAtom object, but
> since I am new to PyTables I am not sure how to overcome this
> problem.


Yes, the conversion routines of PyTables were the culprit. When the
VLArray was fed with objects having a different size than the
underlying atom, a conversion was needed (but not done!). A fix for
this is attached. The same patch has been applied to the SVN trunk as
well as the 1.3 branch, together with updated unit tests to prevent this
problem from appearing again in the future. Please check whether this
works for you.

Thanks for reporting this!

-- 
>0,0<   Francesc Altet     http://www.carabos.com/
V   V   Cárabos Coop. V.   Enjoy Data
 "-"

Index: tables/utils.py
===================================================================
--- tables/utils.py	(revision 1571)
+++ tables/utils.py	(working copy)
@@ -300,76 +300,71 @@
 # the same object or a new one sharing the same memory.
 def convertToNA(arr, atom, copy = False):
     "Convert a generic object into a numarray object"
-    if (isinstance(arr, numarray.NumArray) or
-        isinstance(arr, strings.CharArray)):
-        if copy or not arr.iscontiguous():
-            naarr = arr.copy()
+
+    # Convert arr to a numarray object.
+    # Strings will be *always* copied during conversions as there is not support
+    # for them in the array protocol implementation of numarray yet.
+    # First check NumArray as they will be the most frequently used objects.
+    if isinstance(arr, numarray.NumArray):
+        naarr = arr
+    elif isinstance(arr, strings.CharArray):
+        if ((copy) or (not arr.iscontiguous()) or
+            (arr.itemsize() != atom.itemsize)):
+            # A copy has to be made
+            naarr = strings.array(arr, itemsize=atom.itemsize, padc='\x00')
         else:
-            naarr = arr
+            naarr = arr    # A copy is not necessary
     # Check for NumPy objects
     # This works for both CharArray and regular homogeneous arrays
     elif (numpy_imported and isinstance(arr, numpy.ndarray)):
         if arr.dtype.kind == "U":
             raise NotImplementedError, \
                   """Unicode types are not suppored yet, sorry."""
-        if copy or not arr.flags['CONTIGUOUS']:
-            if arr.dtype.kind == "S":
-                naarr = strings.array(arr)  # A copy is made
-            else:
-                naarr = numarray.array(arr)  # A copy is made
+        if arr.dtype.kind != "S":
+            naarr = numarray.asarray(arr)
         else:
-            if arr.dtype.kind == "S":
-                naarr = strings.asarray(arr)  # A copy is not necessarily made
-            else:
-                naarr = numarray.asarray(arr)  # A copy is not necessarily made
+            naarr = strings.array(arr, itemsize=atom.itemsize, padc = '\x00')
         # Check for Numeric objects
     elif (Numeric_imported and
-          type(arr) == Numeric.ArrayType and
-          not arr.typecode() == 'c'):
-        if copy or not arr.iscontiguous():
-            carr = arr.copy()
+          type(arr) == Numeric.ArrayType):
+        if arr.typecode() != 'c':
+            naarr = numarray.asarray(arr)
         else:
-            carr = arr
-        naarr = numarray.asarray(carr)
-    elif (Numeric_imported and
-          type(arr) == Numeric.ArrayType and
-          arr.typecode() == 'c'):
-        # Special case for Numeric objects of type Char
-        try:
-            naarr = strings.array(arr.tolist(), itemsize=atom.itemsize,
-                                  padc = '\x00')
-            # If still doesn't, issues an error
-        except:  #XXX
-            raise TypeError, """The object '%s' can't be converted into a CharArray object of type '%s'. Sorry, but this object is not supported in this context.""" % (arr, atom)
+            # Special case for Numeric objects of type Char
+            try:
+                naarr = strings.array(arr.tolist(), itemsize=atom.itemsize,
+                                      padc='\x00')
+                # If still doesn't, issues an error
+            except:  #XXX
+                raise TypeError, \
+"""The object '%s' can't be converted into a CharArray object of type '%s'.
+Sorry, but this object is not supported in this context.""" % (arr, atom)
     else:
-        # Test if arr can be converted to a numarray object of the
-        # correct type
+        # Check if arr can be converted to a numarray object of the
+        # correct type.
         try:
-            # 2005-02-04: The 'copy' argument appears in __doc__
-            # but not in documentation.
-            naarr = numarray.array(arr, type=atom.type, copy=copy)
+            naarr = numarray.asarray(arr, type=atom.type)
         # If not, test with a chararray
         except TypeError:
             try:
-                naarr = strings.array(arr, itemsize=atom.itemsize,
-                                      padc = '\x00')
+                naarr = strings.array(arr, itemsize=atom.itemsize, padc='\x00')
             # If still doesn't, issues an error
             except:  #XXX
-                raise TypeError, """The object '%s' can't be converted into a numarray object of type '%s'. Sorry, but this object is not supported in this context.""" % (arr, atom)
+                raise TypeError, \
+"""The object '%s' can't be converted into a numarray object of type '%s'.
+Sorry, but this object is not supported in this context.""" % (arr, atom)
 
-    # Convert to the atom type, if necessary
-    if (isinstance(naarr, numarray.NumArray) and naarr.type() <> atom.type):
-        naarr = naarr.astype(atom.type)         # Force a cast
+    # At this point we should have a NumArray or a CharArray naarr.
+    # Get copies of data if necessary.
+    if isinstance(naarr, numarray.NumArray):
+        # We always want a contiguous buffer
+        # (no matter if has an offset or not; that will be corrected later on)
+        if (copy) or (not naarr.iscontiguous()) or (naarr.type() <> atom.type):
+            # Do a copy of the array in case is not contiguous
+            naarr = numarray.array(naarr, type=atom.type)
 
-    # We always want a contiguous buffer
-    # (no matter if has an offset or not; that will be corrected later)
-    if not naarr.iscontiguous():
-        # Do a copy of the array in case is not contiguous
-        naarr = numarray.NDArray.copy(naarr)
-
     return naarr
 
-
 def convertNAToNumeric(arr):
     """Convert a numarray object into a Numeric one"""

Re: [Pytables-users] Strings in VLArrays (And other types of Array, maybe?)

Reply via email to