Pierre GM wrote:
On Nov 25, 2008, at 10:02 PM, Ryan May wrote:
Pierre GM wrote:
* Your locked version of update probably won't work either, as you force
the converter to output a string (you set the status to the largest
possible, which is the one that outputs strings). Why don't you set the
status to the current one (make a temporary one if needed)?
Looking at the code, it looks like mapper is only used in the upgrade() method. My goal in setting the status to the largest possible value is to lock the
converter to the supplied function.  That way, for user-supplied
converters, the StringConverter doesn't try to upgrade away from it. My
thinking was that if the user-supplied converter function fails, the
user should know. (Though I got this wrong the first time.)

Updated patch attached.  This includes:
 * Updated docstring
 * New tests
 * Fixes for previous issues
 * Fixes to make new tests actually work

I appreciate any and all feedback.

Ryan

--
Ryan May
Graduate Research Assistant
School of Meteorology
University of Oklahoma
Index: numpy/lib/io.py
===================================================================
--- numpy/lib/io.py     (revision 6107)
+++ numpy/lib/io.py     (working copy)
@@ -233,29 +233,136 @@
     for name in todel:
         os.remove(name)
 
-# Adapted from matplotlib
+def _string_like(obj):
+    try: obj + ''
+    except (TypeError, ValueError): return False
+    return True
 
-def _getconv(dtype):
-    typ = dtype.type
-    if issubclass(typ, np.bool_):
-        return lambda x: bool(int(x))
-    if issubclass(typ, np.integer):
-        return lambda x: int(float(x))
-    elif issubclass(typ, np.floating):
-        return float
-    elif issubclass(typ, np.complex):
-        return complex
+def str2bool(value):
+    """
+    Tries to transform a string that is supposed to represent a boolean into a boolean.
+    
+    Raises
+    ------
+    ValueError
+        If the string is not 'True' or 'False' (case independent)
+    """
+    value = value.upper()
+    if value == 'TRUE':
+        return True
+    elif value == 'FALSE':
+        return False
     else:
-        return str
+        raise ValueError("Invalid boolean")
 
+class StringConverter(object):
+    """
+    Factory class for function transforming a string into another object (int,
+    float).
 
-def _string_like(obj):
-    try: obj + ''
-    except (TypeError, ValueError): return 0
-    return 1
+    After initialization, an instance can be called to transform a string 
+    into another object. If the string is recognized as representing a missing
+    value, a default value is returned.
 
+    Parameters
+    ----------
+    dtype : dtype, optional
+        Input data type, used to define a basic function and a default value
+        for missing data. For example, when `dtype` is float, the :attr:`func`
+        attribute is set to ``float`` and the default value to `np.nan`.
+    missing_values : sequence, optional
+        Sequence of strings indicating a missing value.
+
+    Attributes
+    ----------
+    func : function
+        Function used for the conversion
+    default : var
+        Default value to return when the input corresponds to a missing value.
+    mapper : sequence of tuples
+        Sequence of tuples (function, default value) to evaluate in order.
+
+    """
+    from numpy.core import nan # To avoid circular import
+    mapper = [(str2bool, None),
+              (int, -1), #Needs to be int so that it can fail and promote
+                         #to float
+              (float, nan),
+              (complex, nan+0j),
+              (str, '???')]
+
+    def __init__(self, dtype=None, missing_values=None):
+        self._locked = False
+        if dtype is None:
+            self.func = str2bool
+            self.default = None
+            self._status = 0
+        else:
+            dtype = np.dtype(dtype).type
+            if issubclass(dtype, np.bool_):
+                (self.func, self.default, self._status) = (str2bool, 0, 0)
+            elif issubclass(dtype, np.integer):
+                #Needs to be int(float(x)) so that floating point values will
+                #be coerced to int when specified by dtype
+                (self.func, self.default, self._status) = (lambda x: 
int(float(x)), -1, 1)
+            elif issubclass(dtype, np.floating):
+                (self.func, self.default, self._status) = (float, np.nan, 2)
+            elif issubclass(dtype, np.complex):
+                (self.func, self.default, self._status) = (complex, np.nan + 
0j, 3)
+            else:
+                (self.func, self.default, self._status) = (str, '???', -1)
+
+        # Store the list of strings corresponding to missing values.
+        if missing_values is None:
+            self.missing_values = []
+        else:
+            self.missing_values = set(list(missing_values) + [''])
+
+    def __call__(self, value):
+        if value in self.missing_values:
+            return self.default
+        return self.func(value)
+
+    def upgrade(self, value):
+        """
+    Tries to find the best converter for `value`, by testing different
+    converters in order.
+    The order in which the converters are tested is read from the
+    :attr:`_status` attribute of the instance.
+        """
+        try:
+            self.__call__(value)
+        except ValueError:
+            if self._locked:
+                raise
+            _statusmax = len(self.mapper)
+            if self._status == _statusmax:
+                raise ValueError("Could not find a valid conversion function")
+            elif self._status < _statusmax - 1:
+                self._status += 1
+            (self.func, self.default) = self.mapper[self._status]
+            self.upgrade(value)
+
+    def update(self, func, default=None, locked=False):
+        """
+    Sets the :attr:`func` and :attr:`default` attributes directly.
+
+    Parameters
+    ----------
+    func : function
+        Conversion function.
+    default : var, optional
+        Default value to return when a missing value is encountered.
+    locked : bool, optional
+        Whether this should lock in the function so that no upgrading is
+        possible.
+        """
+        self.func = func
+        self.default = default
+        self._locked = locked
+
 def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None,
-            skiprows=0, usecols=None, unpack=False):
+            skiprows=0, usecols=None, unpack=False, names=None):
     """
     Load data from a text file.
 
@@ -266,30 +373,38 @@
     fname : file or string
         File or filename to read.  If the filename extension is ``.gz``,
         the file is first decompressed.
-    dtype : data-type
+    dtype : data-type or None, optional
         Data type of the resulting array.  If this is a record data-type,
         the resulting array will be 1-dimensional, and each row will be
         interpreted as an element of the array.   In this case, the number
         of columns used must match the number of fields in the data-type.
+        If None, the dtypes will be determined by the contents of each
+        column, individually.
     comments : string, optional
         The character used to indicate the start of a comment.
     delimiter : string, optional
         The string used to separate values.  By default, this is any
         whitespace.
     converters : {}
-        A dictionary mapping column number to a function that will convert
-        that column to a float.  E.g., if column 0 is a date string:
-        ``converters = {0: datestr2num}``. Converters can also be used to
-        provide a default value for missing data:
+        A dictionary mapping column number or name to a function that
+        will convert that column to a float.  E.g., if column 0 is a
+        date string: ``converters = {0: datestr2num}``. Converters can
+        also be used to provide a default value for missing data:
         ``converters = {3: lambda s: float(s or 0)}``.
-    skiprows : int
+    skiprows : int, optional
         Skip the first `skiprows` lines.
-    usecols : sequence
-        Which columns to read, with 0 being the first.  For example,
-        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
-    unpack : bool
+    usecols : sequence, optional
+        Which columns to read.  This can be a sequence of either column
+        numbers or column names.  For column numbers, 0 is the first.
+        For example, ``usecols = (1,4,5)`` will extract the 2nd, 5th and
+        6th columns.
+    unpack : bool, optional
         If True, the returned array is transposed, so that arguments may be
         unpacked using ``x, y, z = loadtxt(...)``
+    names : sequence or True, optional
+        If True, the names are read from the first line after skipping
+        `skiprows` lines.  If a sequence, *names* is a list of names to
+        use in creating a flexible dtype for the data.
 
     Returns
     -------
@@ -333,11 +448,10 @@
             fh = gzip.open(fname)
         else:
             fh = file(fname)
-    elif hasattr(fname, 'seek'):
+    elif hasattr(fname, 'readline'):
         fh = fname
     else:
         raise ValueError('fname must be a string or file handle')
-    X = []
 
     def flatten_dtype(dt):
         """Unpack a structured data-type."""
@@ -359,10 +473,6 @@
         else:
             return []
 
-    # Make sure we're dealing with a proper dtype
-    dtype = np.dtype(dtype)
-    defconv = _getconv(dtype)
-
     # Skip the first `skiprows` lines
     for i in xrange(skiprows):
         fh.readline()
@@ -377,37 +487,80 @@
         first_vals = split_line(first_line)
     N = len(usecols or first_vals)
 
-    dtype_types = flatten_dtype(dtype)
-    if len(dtype_types) > 1:
-        # We're dealing with a structured array, each field of
-        # the dtype matches a column
-        converters = [_getconv(dt) for dt in dtype_types]
+    # If names is True, read the field names from the first line
+    if names == True:
+        names = first_vals
+        first_line = ''
+
+    # Make sure we're dealing with a proper dtype
+    if dtype is None:
+        converters = [StringConverter() for i in xrange(N)]
     else:
-        # All fields have the same dtype
-        converters = [defconv for i in xrange(N)]
+        dtype = np.dtype(dtype)
+        dtype_types = flatten_dtype(dtype)
+        if len(dtype_types) > 1:
+            # We're dealing with a structured array, each field of
+            # the dtype matches a column
+            converters = [StringConverter(dt) for dt in dtype_types]
+            names = list(dtype.names)
+        else:
+            # All fields have the same dtype
+            converters = [StringConverter(dtype) for i in xrange(N)]
 
+    # If usecols contains a list of names, convert them to column indices
+    if usecols and _string_like(usecols[0]):
+        usecols = [names.index(_) for _ in usecols]
+
     # By preference, use the converters specified by the user
     for i, conv in (user_converters or {}).iteritems():
+        # If the converter is specified by column name, convert it to an 
index
+        if _string_like(i):
+            i = names.index(i)
         if usecols:
             try:
                 i = usecols.index(i)
             except ValueError:
                 # Unused converter specified
                 continue
-        converters[i] = conv
+        converters[i].update(conv, None, locked=True)
 
     # Parse each line, including the first
+    rows = []
     for i, line in enumerate(itertools.chain([first_line], fh)):
         vals = split_line(line)
         if len(vals) == 0:
             continue
 
         if usecols:
-            vals = [vals[i] for i in usecols]
+            vals = [vals[_] for _ in usecols]
 
-        # Convert each value according to its column and store
-        X.append(tuple([conv(val) for (conv, val) in zip(converters, vals)]))
+        if dtype is None:
+            for converter, item in zip(converters, vals):
+                if len(item.strip()):
+                    converter.upgrade(item)
 
+        # Store the values
+        rows.append(tuple(vals))
+
+    # Convert each value according to its column and store
+    for i,vals in enumerate(rows):
+        rows[i] = tuple([conv(val) for (conv, val) in zip(converters, vals)])
+
+    #Construct final dtype if necessary
+    if dtype is None:
+        dtype_types = [np.array([row[i] for row in rows]).dtype
+            for i in xrange(N)]
+        uniform_dtype = all([dtype_types[0] == dt for dt in dtype_types])
+        if uniform_dtype and not names:
+            dtype = dtype_types[0]
+            dtype_types = dtype
+        else:
+            if not names:
+                names = ['column%d'%i for i in xrange(N)]
+            elif usecols:
+                names = [names[i] for i in usecols]
+            dtype = np.dtype(zip(names, dtype_types))
+
     if len(dtype_types) > 1:
         # We're dealing with a structured array, with a dtype such as
         # [('x', int), ('y', [('s', int), ('t', float)])]
@@ -416,16 +569,16 @@
         # [('x', int), ('s', int), ('t', float)]
         #
         # Then, view the array using the specified dtype.
-        X = np.array(X, dtype=np.dtype([('', t) for t in dtype_types]))
-        X = X.view(dtype)
+        rows = np.array(rows, dtype=np.dtype([('', t) for t in dtype_types]))
+        rows = rows.view(dtype)
     else:
-        X = np.array(X, dtype)
+        rows = np.array(rows, dtype)
 
-    X = np.squeeze(X)
+    rows = np.squeeze(rows)
     if unpack:
-        return X.T
+        return rows.T
     else:
-        return X
+        return rows
 
 
 def savetxt(fname, X, fmt='%.18e',delimiter=' '):
Index: numpy/lib/tests/test_io.py
===================================================================
--- numpy/lib/tests/test_io.py  (revision 6107)
+++ numpy/lib/tests/test_io.py  (working copy)
@@ -140,6 +140,37 @@
         y = np.loadtxt(d, dtype=mydescriptor)
         assert_array_equal(y, b)
 
+    def test_header(self):
+        c = StringIO.StringIO('gender age weight\nM 64.0 75.0\nF 25.0 60.0')
+        x = np.loadtxt(c, names=True, dtype=None)
+        g = np.array(['M', 'F'])
+        a = np.array([64.0, 25.0])
+        w = np.array([75.0, 60.0])
+        assert_array_equal(x['gender'], g)
+        assert_array_equal(x['age'], a)
+        assert_array_equal(x['weight'], w)
+
+    def test_auto_dtype(self):
+        c = StringIO.StringIO('A 64 75.0 3+4j True\nBC 25 60.0 5+6j False')
+        x = np.loadtxt(c, dtype=None)
+        a = np.array(['A', 'BC'])
+        b = np.array([64, 25])
+        c = np.array([75.0, 60.0])
+        d = np.array([3+4j, 5+6j])
+        e = np.array([True, False])
+        assert_array_equal(x['column0'], a)
+        assert_array_equal(x['column1'], b)
+        assert_array_equal(x['column2'], c)
+        assert_array_equal(x['column3'], d)
+        assert_array_equal(x['column4'], e)
+
+    def test_auto_dtype_uniform(self):
+        d = StringIO.StringIO('1 2 3 4\n5 6 7 8\n')
+        x = np.loadtxt(d, dtype=None)
+        print x
+        a = np.array([[1,2,3,4],[5,6,7,8]])
+        assert_array_equal(x, a)
+
     def test_array(self):
         c = StringIO.StringIO()
         c.write('1 2\n3 4')
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@scipy.org
http://projects.scipy.org/mailman/listinfo/numpy-discussion

Reply via email to