Pierre GM wrote:
I think that treating an explicitly-passed-in ' ' delimiter as
identical to 'no delimiter' is a bad idea. If I say that ' ' is the
delimiter, or '\t' is the delimiter, this should be treated *just*
like ',' being the delimiter, where the expected output is:
['1', '2', '3', '4', '', '5']
Valid point.
Well, all, stay tuned for yet another "yet another implementation..."
Found a problem. If you read the names from the file and specify
usecols, you end up with the first N names read from the file as the
fields in your output (where N is the number of entries in usecols),
instead of having the names of the columns you asked for.
For instance:
>>> from StringIO import StringIO
>>> from genload_proposal import loadtxt
>>> f = StringIO('stid stnm relh tair\nnrmn 121 45 9.1')
>>> loadtxt(f, usecols=('stid', 'relh', 'tair'), names=True, dtype=None)
array(('nrmn', 45, 9.0999999999999996),
dtype=[('stid', '|S4'), ('stnm', '<i8'), ('relh', '<f8')])
What I want to come out is:
array(('nrmn', 45, 9.0999999999999996),
dtype=[('stid', '|S4'), ('relh', '<i8'), ('tair', '<f8')])
I've attached a version that fixes this by setting a flag internally if
the names are read from the file. If this flag is true, at the end the
names are filtered down to only the ones that are given in usecols.
I also have one other thought. Is there any way we can make this handle
object arrays, or rather, a field containing objects, specifically
datetime objects? Right now, this does not work because calling view
does not work for object arrays. I'm just looking for a simple way to
store date/time in my record array (currently a string field).
Ryan
--
Ryan May
Graduate Research Assistant
School of Meteorology
University of Oklahoma
"""
Proposal:
Here's an extension to np.loadtxt, designed to take missing values into account.
"""
import itertools
import numpy as np
import numpy.ma as ma
def _is_string_like(obj):
"""
Check whether obj behaves like a string.
"""
try:
obj + ''
except (TypeError, ValueError):
return False
return True
def _to_filehandle(fname, flag='r', return_opened=False):
"""
Returns the filehandle corresponding to a string or a file.
If the string ends in '.gz', the file is automatically unzipped.
Parameters
----------
fname : string, filehandle
Name of the file whose filehandle must be returned.
flag : string, optional
Flag indicating the status of the file ('r' for read, 'w' for write).
return_opened : boolean, optional
Whether to return the opening status of the file.
"""
if _is_string_like(fname):
if fname.endswith('.gz'):
import gzip
fhd = gzip.open(fname, flag)
elif fname.endswith('.bz2'):
import bz2
fhd = bz2.BZ2File(fname)
else:
fhd = file(fname, flag)
opened = True
elif hasattr(fname, 'seek'):
fhd = fname
opened = False
else:
raise ValueError('fname must be a string or file handle')
if return_opened:
return fhd, opened
return fhd
def flatten_dtype(ndtype):
    """
    Unpack a structured data-type into a flat list of its elementary dtypes.

    Nested structured fields are flattened recursively; a non-structured
    dtype is returned as a one-element list.
    """
    field_names = ndtype.names
    if field_names is None:
        # Not a structured dtype: already elementary.
        return [ndtype]
    # Recurse into each field, in declaration order, and concatenate.
    return [subtype
            for name in field_names
            for subtype in flatten_dtype(ndtype.fields[name][0])]
def nested_masktype(datatype):
    """
    Construct the dtype description of a mask mirroring `datatype`.

    Structured fields are mirrored field by field with boolean leaves;
    composite dtypes a la (float, 2) keep their shape with a boolean base
    type; elementary dtypes map to plain `bool`.
    """
    names = datatype.names
    if names:
        descr = []
        for name in names:
            (ndtype, _) = datatype.fields[name]
            descr.append((name, nested_masktype(ndtype)))
        return descr
    # Is this some kind of composite a la (np.float64, 2) ?
    elif datatype.subdtype:
        mdescr = list(datatype.subdtype)
        mdescr[0] = np.dtype(bool)
        return tuple(mdescr)
    else:
        # `bool` replaces the `np.bool` alias removed from modern numpy.
        return bool
class LineSplitter:
    """
    Splits an input line into fields, either at a given delimiter or at
    given fixed positions.

    Parameters
    ----------
    delimiter : {None, string, int, sequence of ints}, optional
        If None or an empty string, fields are separated by runs of
        whitespace (the default `str.split` behavior).
        If a non-empty string (including ' ' or a tab), fields are separated
        at *each* occurrence of that exact string, so consecutive delimiters
        produce empty fields, just as with ','.
        If a single integer, all fields have that fixed width.
        If a sequence of integers, each integer is the width of one field.
    comments : {'#', string}, optional
        Character marking the beginning of a comment: everything from this
        character to the end of the line is discarded.
    """
    def __init__(self, delimiter=None, comments='#'):
        self.comments = comments
        # Duck-type check: does the delimiter behave like a string?
        try:
            delimiter + ''
            stringlike = True
        except (TypeError, ValueError):
            stringlike = False
        if delimiter is None:
            # Default: split on any run of whitespace.
            self._isfixed = False
            self.delimiter = None
        elif stringlike:
            self._isfixed = False
            # Keep an explicitly-given delimiter as-is (the original
            # collapsed ' ' and '\t' to the whitespace default via
            # `delimiter.strip() or None`, losing empty fields).  Only an
            # empty string falls back to whitespace splitting.
            self.delimiter = delimiter or None
        # Delimiter is a list of field widths
        elif hasattr(delimiter, '__iter__'):
            self._isfixed = True
            breaks = np.cumsum([0] + list(delimiter))
            self.slices = [slice(start, stop)
                           for (start, stop) in zip(breaks[:-1], breaks[1:])]
        # Delimiter is a single integer (a uniform field width)
        elif int(delimiter):
            self._isfixed = True
            self.slices = None
            self.delimiter = delimiter
        else:
            self._isfixed = False
            self.delimiter = None
    #
    def __call__(self, line):
        """
        Splits the line at each current delimiter (or at the fixed
        positions). Comments are stripped beforehand.
        """
        # Strip the comments, and the trailing newline: the newline would
        # otherwise survive as part of the last field when an explicit
        # delimiter is used.
        line = line.split(self.comments)[0].rstrip('\r\n')
        if not line:
            return []
        # Fixed-width fields
        if self._isfixed:
            if self.slices is None:
                # All fields share the same width.
                width = self.delimiter
                slices = [slice(start, start + width)
                          for start in range(0, len(line), width)]
            else:
                # Fields have different widths.
                slices = self.slices
            return [line[s].strip() for s in slices]
        return [field.strip() for field in line.split(self.delimiter)]
class NameValidator:
    """
    Validates a list of strings for use as field names.

    Each name is stripped, lower-cased, its spaces replaced by `_`, and any
    character from :attr:`deletechars` removed. A name that ends up empty is
    replaced by a default name (``'f0'``, ``'f1'``...), a name that matches
    the exclude list gets a trailing underscore, and a duplicate gets a
    ``'_<count>'`` suffix.

    Once an instance has been created, it can be called with a list of names
    and a list of valid names will be returned.  The `__call__` method
    accepts an optional keyword, `default`, that sets the default name in
    case of ambiguity (by default ``'f'``, so names default to `f0`, `f1`).

    Parameters
    ----------
    excludelist : sequence, optional
        A list of names to exclude. It is combined with the default list
        ['return', 'file', 'print'].
    deletechars : string, optional
        A string combining invalid characters that must be deleted from the
        names. A double quote is always deleted as well.
    """
    #
    defaultexcludelist = ['return', 'file', 'print']
    # NOTE: restored from the canonical numpy._iotools source; the posted
    # copy was garbled by a mail filter ("[EMAIL PROTECTED]").
    defaultdeletechars = set("""~!@#$%^&*()-=+~\|]}[{';: /?.>,<""")
    #
    def __init__(self, excludelist=None, deletechars=None):
        # Combine with the defaults WITHOUT mutating the caller's list
        # (the original appended the default list as a single nested
        # element, so 'return'/'file'/'print' were never actually excluded).
        if excludelist is None:
            excludelist = []
        self.excludelist = list(excludelist) + self.defaultexcludelist
        # Copy the default set so `delete.add` below cannot mutate the
        # class-level default.
        if deletechars is None:
            delete = set(self.defaultdeletechars)
        else:
            delete = set(deletechars)
        delete.add('"')
        self.deletechars = delete
    #
    def validate(self, names, default='f'):
        """Return the validated list of names (or None if `names` is None)."""
        if names is None:
            return
        #
        validatednames = []
        seen = dict()
        #
        deletechars = self.deletechars
        excludelist = self.excludelist
        for (i, item) in enumerate(names):
            item = item.strip().lower().replace(' ', '_')
            item = ''.join([c for c in item if c not in deletechars])
            if not len(item):
                # Nothing left after cleaning: fall back to a default name.
                item = '%s%d' % (default, i)
            elif item in excludelist:
                # Reserved name: append an underscore.
                item += '_'
            cnt = seen.get(item, 0)
            if cnt > 0:
                # Duplicate name: disambiguate with a counter suffix.
                validatednames.append(item + '_%d' % cnt)
            else:
                validatednames.append(item)
            seen[item] = cnt + 1
        return validatednames
    #
    def __call__(self, names, default='f'):
        return self.validate(names, default)
def str2bool(value):
    """
    Convert the strings 'True' / 'False' (case-insensitive) to booleans.

    Raises
    ------
    ValueError
        If the string is neither 'True' nor 'False' (case independent).
    """
    normalized = value.upper()
    if normalized not in ('TRUE', 'FALSE'):
        raise ValueError("Invalid boolean")
    return normalized == 'TRUE'
class StringConverter:
    """
    Factory class for functions transforming a string into another object
    (int, float...).

    After initialization, an instance can be called to transform a string
    into another object. If the string is recognized as representing a
    missing value, a default value is returned.

    Parameters
    ----------
    dtype_or_func : {None, dtype, function}, optional
        Input data type, used to define a basic function and a default value
        for missing data. For example, when `dtype` is float, the :attr:`func`
        attribute is set to ``float`` and the default value to `np.nan`.
        Alternatively, a function used to convert a string to another object.
        In that latter case, it is recommended to give an associated default
        value as input.
    default : {None, var}, optional
        Value to return by default, that is, when the string to be converted
        is flagged as missing.
    missing_values : {sequence}, optional
        Sequence of strings indicating a missing value.
    locked : {boolean}, optional
        Whether the StringConverter should be locked to prevent automatic
        upgrade or not.

    Attributes
    ----------
    func : function
        Function used for the conversion.
    default : var
        Default value to return when the input corresponds to a missing value.
    _status : integer
        Integer representing the order of the conversion.
    _mapper : sequence of tuples
        Sequence of tuples (dtype, function, default value) to evaluate in
        order.
    _locked : boolean
        Whether the StringConverter is locked, thereby preventing any
        automatic upgrade.
    """
    # Conversions are tried in this order. The abstract scalar types
    # (np.integer, np.floating, np.complexfloating, np.character) are used
    # so that np.issubdtype matches any concrete itemsize, and so that the
    # removed aliases np.complex / np.string_ are no longer needed.
    _mapper = [(np.bool_, str2bool, None),
               (np.integer, int, -1),
               (np.floating, float, np.nan),
               (np.complexfloating, complex, np.nan + 0j),
               (np.character, str, '???')]
    (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
    #
    @classmethod
    def _getsubdtype(cls, val):
        """Returns the type of the dtype of the input variable."""
        return np.array(val).dtype.type
    #
    @classmethod
    def upgrade_mapper(cls, func, default=None):
        """
        Upgrade the mapper of a StringConverter by adding a new function and
        its corresponding default.

        The input function (or sequence of functions) and its associated
        default value (if any) is inserted in penultimate position of the
        mapper. The corresponding type is estimated from the dtype of the
        default value.

        Parameters
        ----------
        func : var
            Function, or sequence of functions.
        default : var, optional
            Default value (or sequence of default values) associated with
            `func`.
        """
        # Func is a single function
        if hasattr(func, '__call__'):
            cls._mapper.insert(-1, (cls._getsubdtype(default), func, default))
            return
        elif hasattr(func, '__iter__'):
            if isinstance(func[0], (tuple, list)):
                # Already a sequence of (type, func, default): insert as-is.
                for _ in func:
                    cls._mapper.insert(-1, _)
                return
            if default is None:
                default = [None] * len(func)
            else:
                default = list(default)
                default.append([None] * (len(func) - len(default)))
            for (fct, dft) in zip(func, default):
                cls._mapper.insert(-1, (cls._getsubdtype(dft), fct, dft))
    #
    def __init__(self, dtype_or_func=None, default=None, missing_values=None,
                 locked=False):
        # Defines a lock for upgrade
        self._locked = bool(locked)
        # No input dtype: minimal initialization
        if dtype_or_func is None:
            self.func = str2bool
            self._status = 0
            self.default = default
        else:
            # Is the input a np.dtype ?
            try:
                self.func = None
                ttype = np.dtype(dtype_or_func).type
            except TypeError:
                # dtype_or_func must be a function, then
                if not hasattr(dtype_or_func, '__call__'):
                    errmsg = "The input argument `dtype` is neither a function"\
                             " or a dtype (got '%s' instead)"
                    raise TypeError(errmsg % type(dtype_or_func))
                # Set the function
                self.func = dtype_or_func
                # If we don't have a default, try to guess it or set it to None
                if default is None:
                    try:
                        default = self.func('0')
                    except ValueError:
                        default = None
                ttype = self._getsubdtype(default)
            # Set the status according to the dtype
            for (i, (deftype, func, default_def)) in enumerate(self._mapper):
                if np.issubdtype(ttype, deftype):
                    self._status = i
                    # Use the mapper's default ONLY when none was given:
                    # the original `default or default_def` wrongly
                    # discarded legitimate falsy defaults such as 0.
                    if default is None:
                        self.default = default_def
                    else:
                        self.default = default
                    break
            # If the input was a dtype, set the function to the last we saw
            if self.func is None:
                self.func = func
            # If the status is 1 (int), change the function to smthg more
            # robust (so that strings like '1.0' still convert).
            if self.func == self._mapper[1][1]:
                self.func = lambda x: int(float(x))
        # Store the list of strings corresponding to missing values.
        if missing_values is None:
            self.missing_values = set([''])
        else:
            self.missing_values = set(list(missing_values) + [''])
    #
    def __call__(self, value):
        """Convert `value`, or return the default if it flags a miss."""
        try:
            return self.func(value)
        except ValueError:
            if value in self.missing_values:
                return self.default
            raise ValueError("Cannot convert string '%s'" % value)
    #
    def upgrade(self, value):
        """
        Tries to find the best converter for `value`, by testing different
        converters in order.
        The order in which the converters are tested is read from the
        :attr:`_status` attribute of the instance.
        """
        try:
            self.__call__(value)
        except ValueError:
            # Raise an exception if we locked the converter...
            if self._locked:
                raise ValueError("Converter is locked and cannot be upgraded")
            _statusmax = len(self._mapper)
            # Complains if we try to upgrade by the maximum
            if self._status == _statusmax:
                raise ValueError("Could not find a valid conversion function")
            elif self._status < _statusmax - 1:
                self._status += 1
            (_, self.func, self.default) = self._mapper[self._status]
            self.upgrade(value)
    #
    def update(self, func, default=None, missing_values='', locked=False):
        """
        Sets the :attr:`func` and :attr:`default` attributes directly.

        Parameters
        ----------
        func : function
            Conversion function.
        default : {var}, optional
            Default value to return when a missing value is encountered.
        missing_values : {var}, optional
            Sequence of strings representing missing values.
        locked : {False, True}, optional
            Whether the status should be locked to prevent automatic upgrade.
        """
        self.func = func
        self._locked = locked
        # Don't reset the default to None if we can avoid it
        if default is not None:
            self.default = default
        # Add the missing values to the existing set
        if missing_values is not None:
            if _is_string_like(missing_values):
                self.missing_values.add(missing_values)
            elif hasattr(missing_values, '__iter__'):
                for val in missing_values:
                    self.missing_values.add(val)
        else:
            # Keep the set-based API (the original reset to a plain list).
            self.missing_values = set()
def genloadtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
               converters=None, missing='', missing_values=None,
               usecols=None, unpack=None,
               names=None, excludelist=None, deletechars=None):
    """
    Load data from a text file and return both the data and a mask of the
    missing values.

    Each row in the text file must have the same number of values.

    Parameters
    ----------
    fname : file or string
        File or filename to read. If the filename extension is `.gz` or
        `.bz2`, the file is first decompressed.
    dtype : data-type
        Data type of the resulting array. If this is a flexible data-type,
        the resulting array will be 1-dimensional, and each row will be
        interpreted as an element of the array. In this case, the number
        of columns used must match the number of fields in the data-type,
        and the names of each field will be set by the corresponding name
        of the dtype.
        If None, the dtypes will be determined by the contents of each
        column, individually.
    comments : {string}, optional
        The character used to indicate the start of a comment.
    delimiter : {string}, optional
        The string used to separate values. By default, this is any
        whitespace.
    skiprows : {int}, optional
        Number of lines to skip at the beginning of the file.
    converters : {None, dictionary}, optional
        A dictionary mapping column number to a function that will convert
        that column to a float. E.g., if column 0 is a date string:
        ``converters = {0: datestr2num}``. Converters can also be used to
        provide a default value for missing data:
        ``converters = {3: lambda s: float(s or 0)}``.
    missing : {string}, optional
        A string representing a missing value, irrespective of the column
        where it appears (e.g., `'missing'` or `'unused'`).
    missing_values : {None, dictionary}, optional
        A dictionary mapping a column number (or name) to a string
        indicating whether the corresponding field should be masked.
    usecols : {None, sequence}, optional
        Which columns to read, with 0 being the first. For example,
        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
    names : {None, True, string, sequence}, optional
        If `names` is True, the field names are read from the first valid
        line after the first `skiprows` lines.
        If `names` is a sequence or a single string of comma-separated
        names, the names will be used to define the field names in a
        flexible dtype.
        If `names` is None, the names of the dtype fields will be used, if
        any.
    unpack : {bool}, optional
        If True, the returned arrays are transposed, so that arguments may
        be unpacked using ``x, y, z = loadtxt(...)``.
    excludelist : {None, sequence}, optional
        Names to exclude when validating the field names (forwarded to
        NameValidator).
    deletechars : {None, string}, optional
        Characters to delete from the field names (forwarded to
        NameValidator).

    Returns
    -------
    (output, outputmask) : tuple of ndarrays
        The data read from the text file, and a boolean array of matching
        shape that is True wherever a field was flagged as missing.

    Notes
    -----
    * When spaces are used as delimiters, there should not be any missing
      data between two fields.
    * When `names` is not None, names are lower cased, the spaces replaced
      by underscores, and any illegal character suppressed.
    * When the variables are named (either by a flexible dtype or with
      `names`), there must not be any header in the file (else a
      :exc:`ValueError` exception is raised).
    """
    # Validate the user-supplied dictionaries.
    user_converters = converters or {}
    if not isinstance(user_converters, dict):
        errmsg = "The input argument 'converters' should be a valid dictionary "\
                 "(got '%s' instead)"
        raise TypeError(errmsg % type(user_converters))
    user_missing_values = missing_values or {}
    if not isinstance(user_missing_values, dict):
        errmsg = "The input argument 'missing_values' should be a valid "\
                 "dictionary (got '%s' instead)"
        raise TypeError(errmsg % type(missing_values))
    # Strings flagging a missing value in any column.
    defmissing = [_.strip() for _ in missing.split(',')] + ['']
    # Initialize the filehandle, the LineSplitter and the NameValidator
    fhd = _to_filehandle(fname)
    split_line = LineSplitter(delimiter=delimiter, comments=comments)
    validate_names = NameValidator(excludelist=excludelist,
                                   deletechars=deletechars)
    # Get the first valid line after the first skiprows ones
    for i in range(skiprows):
        fhd.readline()
    first_values = None
    while not first_values:
        first_line = fhd.readline()
        if first_line == '':
            raise IOError('End-of-file reached before encountering data.')
        first_values = split_line(first_line)
    # Check the columns to use
    if usecols is not None:
        usecols = list(usecols)
    nbcols = len(usecols or first_values)
    # Whether or not we have read the names from the file
    read_names = False
    # Check the names and overwrite the dtype.names if needed
    if dtype is not None:
        dtype = np.dtype(dtype)
    dtypenames = getattr(dtype, 'names', None)
    if names is True:
        names = validate_names(first_values)
        # The first line was a header: don't parse it again as data.
        first_line = ''
        read_names = True
    elif _is_string_like(names):
        names = validate_names([_.strip() for _ in names.split(',')])
    elif names:
        names = validate_names(names)
    elif dtypenames:
        dtype.names = validate_names(dtypenames)
    if names and dtypenames:
        dtype.names = names
    # If usecols is a list of names, convert to a list of indices
    if usecols:
        for (i, current) in enumerate(usecols):
            if _is_string_like(current):
                usecols[i] = names.index(current)
    # If user_missing_values has names as keys, transform them to indices
    missing_values = {}
    for (key, val) in user_missing_values.items():
        # If val is a list, flatten it. In any case, add missing & '' to the
        # list of strings flagging a miss for that column.
        if isinstance(val, (list, tuple)):
            val = [str(_) for _ in val]
        else:
            val = [str(val), ]
        val.extend(defmissing)
        if _is_string_like(key):
            try:
                missing_values[names.index(key)] = val
            except ValueError:
                # Unknown column name: silently ignored.
                pass
        else:
            missing_values[key] = val
    # Initialize the default converters
    if dtype is None:
        # Note: we can't use a [...]*nbcols, as we would have 3 times the same
        # ... converter, instead of 3 different converters.
        converters = [StringConverter(None,
                              missing_values=missing_values.get(_, defmissing))
                      for _ in range(nbcols)]
    else:
        flatdtypes = flatten_dtype(dtype)
        # Initialize the converters
        if len(flatdtypes) > 1:
            # Flexible type : get a converter from each dtype
            converters = [StringConverter(dt,
                              missing_values=missing_values.get(i, defmissing))
                          for (i, dt) in enumerate(flatdtypes)]
        else:
            # Set to a default converter (but w/ different missing values)
            converters = [StringConverter(dtype,
                              missing_values=missing_values.get(_, defmissing))
                          for _ in range(nbcols)]
    missing_values = [_.missing_values for _ in converters]
    # Update the converters to use the user-defined ones
    for (i, conv) in user_converters.items():
        # If the converter is specified by column names, use the index instead
        if _is_string_like(i):
            i = names.index(i)
        if usecols:
            try:
                i = usecols.index(i)
            except ValueError:
                # Unused converter specified
                continue
        converters[i].update(conv, default=None,
                             missing_values=missing_values[i],
                             locked=True)
    rows = []
    rowmasks = []
    # Parse each line
    for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
        values = split_line(line)
        # Skip an empty line
        if len(values) == 0:
            continue
        # Select only the columns we need
        if usecols:
            values = [values[_] for _ in usecols]
        # Check whether we need to update the converter
        if dtype is None:
            for (j, (converter, item)) in enumerate(zip(converters, values)):
                if len(item.strip()):
                    converter.upgrade(item)
        # Store the values
        rows.append(tuple(values))
        rowmasks.append(tuple([val in mss
                               for (val, mss) in zip(values, missing_values)]))
    # Convert each value according to the converter:
    for (i, vals) in enumerate(rows):
        rows[i] = tuple([conv(val) for (conv, val) in zip(converters, vals)])
    # If we read the names from the file, we have more names than columns
    # we want, so we need to filter down to the names we actually want.
    if usecols and read_names:
        names = [names[_] for _ in usecols]
    # Reset the dtype
    if dtype is None:
        # Get the dtypes from the first row
        coldtypes = [np.array(val).dtype for val in rows[0]]
        # Find the columns with strings, and take the largest number of chars.
        strcolidx = [i for (i, v) in enumerate(coldtypes) if v.char == 'S']
        for i in strcolidx:
            coldtypes[i] = "|S%i" % max(len(row[i]) for row in rows)
        if names is None:
            # If the dtype is uniform, don't define names, else use ''
            base = coldtypes[0]
            if np.all([(dt == base) for dt in coldtypes]):
                (ddtype, mdtype) = (base, bool)
            else:
                ddtype = [('', dt) for dt in coldtypes]
                mdtype = [('', bool) for dt in coldtypes]
        else:
            # list(...) around zip so the pairs are usable as a dtype spec.
            ddtype = list(zip(names, coldtypes))
            mdtype = list(zip(names, [bool] * len(coldtypes)))
        output = np.array(rows, dtype=ddtype)
        outputmask = np.array(rowmasks, dtype=mdtype)
    else:
        # Overwrite the initial dtype names if needed
        if names and dtype.names:
            dtype.names = names
        # Check whether we have a nested dtype
        flatdtypes = flatten_dtype(dtype)
        if len(flatdtypes) > 1:
            # Nested dtype, eg [('a', int), ('b', [('b0', int), ('b1', 'f4')])]
            # First, create the array using a flattened dtype:
            # [('a', int), ('b1', int), ('b2', float)]
            # Then, view the array using the specified dtype.
            rows = np.array(rows, dtype=np.dtype([('', t) for t in flatdtypes]))
            output = rows.view(dtype)
            # Now, process the rowmasks the same way
            rowmasks = np.array(rowmasks,
                                dtype=np.dtype([('', bool)
                                                for t in flatdtypes]))
            # Construct the new dtype
            mdtype = nested_masktype(dtype)
            outputmask = rowmasks.view([tuple(_) for _ in mdtype])
        else:
            output = np.array(rows, dtype)
            if dtype.names:
                outputmask = np.array(rowmasks,
                                      dtype=[(_, bool) for _ in dtype.names])
            else:
                outputmask = np.array(rowmasks, dtype=bool)
    # Construct the final array
    if unpack:
        return (output.squeeze().T, outputmask.squeeze().T)
    # NOTE: the mask is no longer transposed here -- the original transposed
    # only the mask in the non-unpack case, leaving it inconsistent with the
    # data for 2-D results.
    return (output.squeeze(), outputmask.squeeze())
def loadtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
            converters=None, missing='', missing_values=None,
            usecols=None, unpack=None,
            names=None, excludelist=None, deletechars=None):
    """
    Load data from a text file, discarding the mask information.

    This is a thin wrapper around `genloadtxt` that returns only the data
    array; see `genloadtxt` for the meaning of the parameters.
    """
    (data, _) = genloadtxt(fname, dtype=dtype, comments=comments,
                           delimiter=delimiter, skiprows=skiprows,
                           converters=converters, missing=missing,
                           missing_values=missing_values, usecols=usecols,
                           unpack=unpack, names=names,
                           excludelist=excludelist, deletechars=deletechars)
    return data
def mloadtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
             converters=None, missing='', missing_values=None,
             usecols=None, unpack=None,
             names=None, excludelist=None, deletechars=None):
    """
    Load data from a text file and return it as a masked array.

    This is a thin wrapper around `genloadtxt` that combines the data and
    its mask into a single `numpy.ma.MaskedArray`; see `genloadtxt` for the
    meaning of the parameters.
    """
    (data, datamask) = genloadtxt(fname, dtype=dtype, comments=comments,
                                  delimiter=delimiter, skiprows=skiprows,
                                  converters=converters, missing=missing,
                                  missing_values=missing_values,
                                  usecols=usecols, unpack=unpack,
                                  names=names, excludelist=excludelist,
                                  deletechars=deletechars)
    masked = data.view(ma.MaskedArray)
    masked.mask = datamask
    return masked
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@scipy.org
http://projects.scipy.org/mailman/listinfo/numpy-discussion