On 11/16/10 10:01 AM, Christopher Barker wrote:
OK -- I'll whip up a test similar to yours -- stay tuned!
Here's what I've done:
import numpy as np
from maproomlib.utility import file_scanner
def gen_file():
    """Write the benchmark input file ``test.dat`` in the current directory.

    The file holds 1200 lines; each line is 2048 repetitions of ``'1 '``
    followed by a newline, i.e. 1200 * 2048 whitespace-separated values.
    """
    # The line content never changes, so build it once instead of per row.
    line = '1 ' * 2048 + '\n'
    # ``with`` guarantees the handle is closed even if a write fails.
    with open('test.dat', 'w') as f:
        for _ in range(1200):
            f.write(line)
def read_file1():
    """Read ``test.dat`` with ``file_scanner.FileScan`` (length unknown).

    Returns whatever ``FileScan`` produces for the open file (its C
    docstring describes an array of doubles).
    """
    # ``with`` closes the file even when the scanner raises.
    with open('test.dat') as f:
        return file_scanner.FileScan(f)
def read_file2():
    """Read ``test.dat`` with ``file_scanner.FileScanN`` (length known).

    Asks for exactly 1200 * 2048 values, the amount ``gen_file`` writes,
    and returns the scanner's result (doubles, per its C docstring).
    """
    # ``with`` closes the file even when the scanner raises.
    with open('test.dat') as f:
        return file_scanner.FileScanN(f, 1200 * 2048)
def read_file3():
    """Read ``test.dat`` with ``file_scanner.FileScanN_single`` (length known).

    Asks for exactly 1200 * 2048 values, the amount ``gen_file`` writes,
    and returns the scanner's result (single precision, per its C docstring).
    """
    # ``with`` closes the file even when the scanner raises.
    with open('test.dat') as f:
        return file_scanner.FileScanN_single(f, 1200 * 2048)
def read_fromfile1():
    """Read all of ``test.dat`` with ``np.fromfile`` as single precision.

    The length is not given up front; values are parsed as
    whitespace-separated text (``sep=' '``) into a ``float32`` array.
    """
    # ``with`` closes the file even when parsing raises.
    with open('test.dat') as f:
        return np.fromfile(f, dtype=np.float32, sep=' ')
def read_fromfile2():
    """Read all of ``test.dat`` with ``np.fromfile`` as double precision.

    The length is not given up front; values are parsed as
    whitespace-separated text (``sep=' '``) into a ``float64`` array.
    """
    # ``with`` closes the file even when parsing raises.
    with open('test.dat') as f:
        return np.fromfile(f, dtype=np.float64, sep=' ')
def read_fromstring1():
    """Read ``test.dat`` into memory, then parse it with ``np.fromstring``.

    The whole file is slurped into one string and parsed as
    whitespace-separated text (``sep=' '``) into a ``float32`` array.
    """
    # Close the file as soon as its contents are in memory; the local is
    # named ``text`` so the builtin ``str`` is not shadowed.
    with open('test.dat') as f:
        text = f.read()
    return np.fromstring(text, dtype=np.float32, sep=' ')
And the results (ipython's timeit):
In [40]: timeit test.read_fromfile1()
1 loops, best of 3: 561 ms per loop
In [41]: timeit test.read_fromfile2()
1 loops, best of 3: 570 ms per loop
In [42]: timeit test.read_file1()
1 loops, best of 3: 336 ms per loop
In [43]: timeit test.read_file2()
1 loops, best of 3: 341 ms per loop
In [44]: timeit test.read_file3()
1 loops, best of 3: 515 ms per loop
In [46]: timeit test.read_fromstring1()
1 loops, best of 3: 301 ms per loop
So my filescanner is faster, but not radically so, than fromfile().
However, reading the whole file into a string, then using fromstring()
is, in fact, the fastest method -- interesting -- shows you why you need
to profile!
Also, with my code, reading singles is slower than doubles -- odd. Perhaps the C lib fscanf reads doubles anyway, then converts to singles?
Anyway, for my needs, my file_scanner and fromfile() are fast enough, and much faster than parsing the files with Python. My issue with fromfile is flexibility and robustness -- it's buggy in the face of ill-formed files. See the list archives and the bug reports for more detail.
Still, it seems your very basic method is indeed a faster way to go. I've enclosed the files. It's currently built as part of a larger lib, so no setup.py -- though it could be written easily enough.
-Chris -- Christopher Barker, Ph.D. Oceanographer Emergency Response Division NOAA/NOS/OR&R (206) 526-6959 voice 7600 Sand Point Way NE (206) 526-6329 fax Seattle, WA 98115 (206) 526-6317 main reception [email protected]
#include "Python.h"
#include <numpy/arrayobject.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// NOTE: these buffer sizes were picked very arbitrarily, and have
// remarkably little impact on performance on my system.
// BUFFERSIZE1: how many doubles FileScan() requests from filescan_double()
// per call.
// BUFFERSIZE2: sizing unit, counted in BUFFERSIZE1-sized chunks, for the
// storage FileScan() sets aside while the amount of input is still unknown.
#define BUFFERSIZE1 1024
#define BUFFERSIZE2 64
/*
 * Scan up to NNums floating point numbers from infile into array (doubles).
 *
 * Any character that cannot start a number is discarded one at a time, so
 * arbitrary text between the numbers is skipped.
 *
 * Returns the number of values stored. A result smaller than NNums means
 * fscanf() reported EOF before NNums values were found.
 *
 * When all NNums values were read, trailing whitespace is consumed as well,
 * leaving the stream positioned on the next non-whitespace character (or at
 * EOF).
 */
int filescan_double(FILE *infile, int NNums, double *array){
    double N;
    int i, j;
    int c;

    for (i = 0; i < NNums; i++){
        // fscanf() returns 0 when the upcoming character cannot start a
        // number: drop that single character and try again.
        while ( (j = fscanf(infile, "%lg", &N)) == 0 ){
            (void) fgetc(infile);
        }
        if (j == EOF) {
            // Ran out of input: report how many values were stored.
            return(i);
        }
        array[i] = N;
    }

    // Go to the end of any whitespace:
    do {
        c = fgetc(infile);
    } while ( c != EOF && isspace(c) );

    if (c != EOF){
        // Not EOF: push the character back. ungetc() is used rather than
        // fseek(-1) so this also works on streams that cannot seek (pipes,
        // stdin) and on text-mode streams.
        ungetc(c, infile);
    }
    return(i);
}
/*
 * Scan up to NNums floating point numbers from infile into array (floats).
 *
 * Any character that cannot start a number is discarded one at a time, so
 * arbitrary text between the numbers is skipped.
 *
 * Returns the number of values stored. A result smaller than NNums means
 * fscanf() reported EOF before NNums values were found.
 *
 * When all NNums values were read, trailing whitespace is consumed as well,
 * leaving the stream positioned on the next non-whitespace character (or at
 * EOF).
 */
int filescan_single(FILE *infile, int NNums, float *array){
    float N;
    int i, j;
    int c;

    for (i = 0; i < NNums; i++){
        // "%g" (not "%lg") because N is a float here. fscanf() returns 0
        // when the upcoming character cannot start a number: drop that
        // single character and try again.
        while ( (j = fscanf(infile, "%g", &N)) == 0 ){
            (void) fgetc(infile);
        }
        if (j == EOF) {
            // Ran out of input: report how many values were stored.
            return(i);
        }
        array[i] = N;
    }

    // Go to the end of any whitespace:
    do {
        c = fgetc(infile);
    } while ( c != EOF && isspace(c) );

    if (c != EOF){
        // Not EOF: push the character back. ungetc() is used rather than
        // fseek(-1) so this also works on streams that cannot seek (pipes,
        // stdin) and on text-mode streams.
        ungetc(c, infile);
    }
    return(i);
}
// Python-level docstring for FileScanN; attached to the function through
// the file_scanner_methods table.
// NOTE(review): the text says "Numeric vector", but the implementation
// builds its result with PyArray_SimpleNew -- wording looks stale; confirm
// before changing the user-visible string.
static char doc_FileScanN[] =
"FileScanN(file, N)\n\n"
"Reads N values in the ascii file, and produces a Numeric vector of\n"
"length N full of Floats (C doubles).\n\n"
"Raises an exception if there are fewer than N numbers in the file.\n\n"
"All text in the file that is not part of a floating point number is\n"
"skipped over.\n\n"
"After reading N numbers, the file is left before the next non-whitespace\n"
"character in the file. This will often leave the file at the start of\n"
"the next line, after scanning a line full of numbers.\n";
// Python-level docstring for FileScanN_single; attached to the function
// through the file_scanner_methods table.
// NOTE(review): the text says "Numeric vector", but the implementation
// builds its result with PyArray_SimpleNew -- wording looks stale; confirm
// before changing the user-visible string.
static char doc_FileScanN_single[] =
"FileScanN_single(file, N)\n\n"
"Reads N values in the ascii file, and produces a Numeric vector of\n"
"length N full of single precision floats (C floats).\n\n"
"Raises an exception if there are fewer than N numbers in the file.\n\n"
"All text in the file that is not part of a floating point number is\n"
"skipped over.\n\n"
"After reading N numbers, the file is left before the next non-whitespace\n"
"character in the file. This will often leave the file at the start of\n"
"the next line, after scanning a line full of numbers.\n";
/*
 * FileScanN(file, N): read exactly N numbers from an open Python file object
 * and return them as a new 1-d array of C doubles.
 *
 * Raises ValueError when N is negative, when the file object is closed, or
 * when the input ends before N numbers were found. Returns NULL with the
 * Python error set in every failure case.
 */
static PyObject * file_scanner_FileScanN(PyObject *self, PyObject *args)
{
    PyFileObject *File;
    PyArrayObject *Array;
    FILE *infile;
    npy_intp length;
    // The "i" format stores a C int. Parsing straight into an npy_intp
    // leaves its upper bytes uninitialised on 64-bit platforms, so parse
    // into an int and widen afterwards.
    int NNums;
    int NumRead;

    if (!PyArg_ParseTuple(args, "O!i", &PyFile_Type, &File, &NNums) ) {
        return NULL;
    }
    if (NNums < 0) {
        PyErr_SetString (PyExc_ValueError, "N must not be negative");
        return NULL;
    }

    // PyFile_AsFile() yields NULL once the Python file has been closed.
    infile = PyFile_AsFile( (PyObject*)File );
    if (infile == NULL) {
        PyErr_SetString (PyExc_ValueError, "I/O operation on closed file");
        return NULL;
    }

    length = NNums;
    Array = (PyArrayObject *) PyArray_SimpleNew(1, &length, PyArray_DOUBLE);
    if (Array == NULL) {
        return NULL;
    }

    // An array fresh from PyArray_SimpleNew is contiguous, so the scanner
    // can fill it in place; no staging buffer or copy loop is needed.
    NumRead = filescan_double(infile, NNums, (double *) PyArray_DATA(Array));
    if (NumRead < NNums) {
        Py_DECREF(Array);
        PyErr_SetString (PyExc_ValueError,
                         "End of File reached before all numbers found");
        return NULL;
    }
    return PyArray_Return(Array);
}
/*
 * FileScanN_single(file, N): read exactly N numbers from an open Python file
 * object and return them as a new 1-d array of C floats.
 *
 * Raises ValueError when N is negative, when the file object is closed, or
 * when the input ends before N numbers were found. Returns NULL with the
 * Python error set in every failure case.
 */
static PyObject * file_scanner_FileScanN_single(PyObject *self, PyObject *args)
{
    PyFileObject *File;
    PyArrayObject *Array;
    FILE *infile;
    npy_intp length;
    // The "i" format stores a C int. Parsing straight into an npy_intp
    // leaves its upper bytes uninitialised on 64-bit platforms, so parse
    // into an int and widen afterwards.
    int NNums;
    int NumRead;

    if (!PyArg_ParseTuple(args, "O!i", &PyFile_Type, &File, &NNums) ) {
        return NULL;
    }
    if (NNums < 0) {
        PyErr_SetString (PyExc_ValueError, "N must not be negative");
        return NULL;
    }

    // PyFile_AsFile() yields NULL once the Python file has been closed.
    infile = PyFile_AsFile( (PyObject*)File );
    if (infile == NULL) {
        PyErr_SetString (PyExc_ValueError, "I/O operation on closed file");
        return NULL;
    }

    length = NNums;
    Array = (PyArrayObject *) PyArray_SimpleNew(1, &length, PyArray_FLOAT);
    if (Array == NULL) {
        return NULL;
    }

    // An array fresh from PyArray_SimpleNew is contiguous, so the scanner
    // can fill it in place; no staging buffer or copy loop is needed.
    NumRead = filescan_single(infile, NNums, (float *) PyArray_DATA(Array));
    if (NumRead < NNums) {
        Py_DECREF(Array);
        PyErr_SetString (PyExc_ValueError,
                         "End of File reached before all numbers found");
        return NULL;
    }
    return PyArray_Return(Array);
}
// Python-level docstring for FileScan; attached to the function through
// the file_scanner_methods table.
// NOTE(review): the text says "Numeric vector", but the implementation
// builds its result with PyArray_SimpleNew -- wording looks stale; confirm
// before changing the user-visible string.
static char doc_FileScan[] =
"FileScan(file)\n\n"
"Reads all the values in rest of the open ascii file: file, and produces\n"
"a Numeric vector full of Floats (C doubles).\n\n"
"All text in the file that is not part of a floating point number is\n"
"skipped over.\n\n"
;
/*
 * FileScan(file): read every remaining number from an open Python file
 * object and return them as a new 1-d array of C doubles.
 *
 * The amount of input is not known in advance, so values are collected in
 * a heap buffer that doubles whenever it cannot hold one more chunk of
 * BUFFERSIZE1 values, and are copied into the result array at the end.
 *
 * Raises ValueError when the file object is closed and MemoryError when the
 * buffer cannot be grown. Returns NULL with the Python error set on failure.
 */
static PyObject * file_scanner_FileScan(PyObject *self, PyObject *args)
{
    FILE *infile;
    PyFileObject *File;
    PyArrayObject *Array;
    double *Data = NULL;        // values collected so far
    double *Grown;
    size_t Capacity = 0;        // number of doubles Data has room for
    size_t NewCapacity;
    npy_intp ScanCount = 0;     // number of doubles stored in Data
    int NumRead;

    if (!PyArg_ParseTuple(args, "O!", &PyFile_Type, &File) ) {
        return NULL;
    }

    // PyFile_AsFile() yields NULL once the Python file has been closed.
    infile = PyFile_AsFile( (PyObject*)File );
    if (infile == NULL) {
        PyErr_SetString (PyExc_ValueError, "I/O operation on closed file");
        return NULL;
    }

    do {
        // Make sure one more full chunk fits before asking for it.
        if ((size_t)ScanCount + BUFFERSIZE1 > Capacity) {
            NewCapacity = Capacity ? 2 * Capacity
                                   : (size_t)BUFFERSIZE1 * BUFFERSIZE2;
            Grown = NULL;
            // Refuse sizes whose byte count would overflow size_t.
            if (NewCapacity <= ((size_t)-1) / sizeof(double)) {
                Grown = (double*) realloc(Data, NewCapacity * sizeof(double));
            }
            if (Grown == NULL) {
                free(Data);
                return PyErr_NoMemory();
            }
            Data = Grown;
            Capacity = NewCapacity;
        }
        NumRead = filescan_double(infile, BUFFERSIZE1, Data + ScanCount);
        ScanCount += NumRead;
        // Keep going until a call finds no numbers at all.
    } while (NumRead > 0);

    // copy all the data to a PyArray
    Array = (PyArrayObject *) PyArray_SimpleNew(1, &ScanCount, PyArray_DOUBLE);
    if (Array == NULL) {
        free(Data);
        return NULL;
    }
    // An array fresh from PyArray_SimpleNew is contiguous, so one memcpy
    // replaces the element-by-element strided copy.
    memcpy(PyArray_DATA(Array), Data, (size_t)ScanCount * sizeof(double));

    free(Data);
    return PyArray_Return(Array);
}
/* Method table: maps each Python-visible name to its C implementation,
 * calling convention and docstring. */
static PyMethodDef file_scanner_methods[] = {
    {"FileScanN_single", file_scanner_FileScanN_single, METH_VARARGS,
     doc_FileScanN_single},
    {"FileScanN",        file_scanner_FileScanN,        METH_VARARGS,
     doc_FileScanN},
    {"FileScan",         file_scanner_FileScan,         METH_VARARGS,
     doc_FileScan},
    {NULL, NULL, 0, NULL}   /* Sentinel */
};
void initfile_scanner(void){
(void) Py_InitModule("file_scanner", file_scanner_methods);
import_array()
}
test_simple_large.py
Description: application/python
_______________________________________________ NumPy-Discussion mailing list [email protected] http://mail.scipy.org/mailman/listinfo/numpy-discussion
