Re: [Numpy-discussion] seeking advice on a fast string->array conversion

Christopher Barker Tue, 16 Nov 2010 10:44:25 -0800

On 11/16/10 10:01 AM, Christopher Barker wrote:

OK -- I'll whip up a test similar to yours -- stay tuned!


Here's what I've done:

import numpy as np
from maproomlib.utility import file_scanner

def gen_file():
    """Write the benchmark input file 'test.dat' in the current directory.

    The file holds 1200 lines, each made of 2048 repetitions of the
    token '1 ' followed by a newline.
    """
    line = '1 ' * 2048 + '\n'
    # open() replaces the Python-2-only file() builtin; the with-block
    # closes the handle even if a write fails.
    with open('test.dat', 'w') as f:
        for _ in range(1200):
            f.write(line)

def read_file1():
    """Read 'test.dat' with file_scanner.FileScan (unknown length).

    Returns whatever FileScan returns for the open file; the original
    docstring describes the result as doubles.
    """
    # open() replaces the Python-2-only file() builtin; the with-block
    # closes the handle even if the scan raises.
    with open('test.dat') as f:
        return file_scanner.FileScan(f)

def read_file2():
    """Read 'test.dat' with file_scanner.FileScanN (known length).

    Requests 1200*2048 values; the original docstring describes the
    result as doubles.
    """
    # open() replaces the Python-2-only file() builtin; the with-block
    # closes the handle even if the scan raises.
    with open('test.dat') as f:
        return file_scanner.FileScanN(f, 1200 * 2048)

def read_file3():
    """Read 'test.dat' with file_scanner.FileScanN_single (known length).

    Requests 1200*2048 values; the original docstring describes the
    result as singles.
    """
    # open() replaces the Python-2-only file() builtin; the with-block
    # closes the handle even if the scan raises.
    with open('test.dat') as f:
        return file_scanner.FileScanN_single(f, 1200 * 2048)

def read_fromfile1():
    """Read 'test.dat' as float32 with np.fromfile() (unknown length).

    The file is parsed as text with sep=' '. Returns a 1-D float32 array.
    """
    # Passing the path lets NumPy open and close the file itself, so no
    # handle is left open if parsing fails.
    return np.fromfile('test.dat', dtype=np.float32, sep=' ')

def read_fromfile2():
    """Read 'test.dat' as float64 with np.fromfile() (unknown length).

    The file is parsed as text with sep=' '. Returns a 1-D float64 array.
    """
    # Passing the path lets NumPy open and close the file itself, so no
    # handle is left open if parsing fails.
    return np.fromfile('test.dat', dtype=np.float64, sep=' ')

def read_fromstring1():
    """Read all of 'test.dat' into memory, then parse it with np.fromstring().

    The text is parsed with sep=' '. Returns a 1-D float32 array.
    """
    # The with-block closes the file as soon as the text is in memory,
    # and the local name no longer shadows the builtin str.
    with open('test.dat') as f:
        text = f.read()
    return np.fromstring(text, dtype=np.float32, sep=' ')

And the results (ipython's timeit):

In [40]: timeit test.read_fromfile1()
1 loops, best of 3: 561 ms per loop

In [41]: timeit test.read_fromfile2()
1 loops, best of 3: 570 ms per loop

In [42]: timeit test.read_file1()
1 loops, best of 3: 336 ms per loop

In [43]: timeit test.read_file2()
1 loops, best of 3: 341 ms per loop

In [44]: timeit test.read_file3()
1 loops, best of 3: 515 ms per loop

In [46]: timeit test.read_fromstring1()
1 loops, best of 3: 301 ms per loop

So my filescanner is faster, but not radically so, than fromfile(). However, reading the whole file into a string, then using fromstring() is, in fact, the fastest method -- interesting -- shows you why you need to profile!

Also, with my code, reading singles is slower than doubles -- odd. Perhaps the C lib fscanf reads doubles anyway, then converts to singles?

Anyway, for my needs, my file_scanner and fromfile() are fast enough, and much faster than parsing the files with Python. My issue with fromfile is flexibility and robustness -- it's buggy in the face of ill-formed files. See the list archives and the bug reports for more detail.


Still, it seems your very basic method is indeed a faster way to go.

I've enclosed the files. It's currently built as part of a larger lib, so no setup.py -- though it could be written easily enough.


-Chris



--
Christopher Barker, Ph.D.
Oceanographer

Emergency Response Division
NOAA/NOS/OR&R            (206) 526-6959   voice
7600 Sand Point Way NE   (206) 526-6329   fax
Seattle, WA  98115       (206) 526-6317   main reception

[email protected]

#include "Python.h"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <numpy/arrayobject.h>

// NOTE: these buffer sizes were picked very arbitrarily, and have
// remarkably little impact on performance on my system.
#define BUFFERSIZE1 1024
#define BUFFERSIZE2 64


/*
 * Scan up to NNums floating point values from infile into array.
 *
 * Any character that cannot start a number is discarded one at a time
 * until fscanf succeeds or reports EOF.
 *
 * Returns the number of values stored.  This is less than NNums only
 * when fscanf reported EOF first; in that case the function returns
 * immediately without touching the stream further.
 *
 * When all NNums values were read, trailing whitespace is consumed and
 * the stream is left positioned at the next non-whitespace character
 * (or at end of file).
 */
int filescan_double(FILE *infile, int NNums, double *array){

    double value;
    int count;
    int status;
    int c;

    for (count = 0; count < NNums; count++){
        // fscanf returns 0 on a matching failure: drop one character
        // and try again.
        while ( (status = fscanf(infile, "%lg", &value)) == 0 ){
            (void) fgetc(infile);
        }
        if (status == EOF) {
            return count;
        }
        array[count] = value;
    }

    // Go to the end of any whitespace:
    do {
        c = fgetc(infile);
    } while (c != EOF && isspace(c));

    if (c != EOF){
        // Push the non-whitespace character back.  ungetc is used rather
        // than fseek(-1, SEEK_CUR), which is not valid on text streams
        // and fails on non-seekable streams.
        ungetc(c, infile);
    }
    return count;
}

/*
 * Scan up to NNums single precision values from infile into array.
 *
 * Any character that cannot start a number is discarded one at a time
 * until fscanf succeeds or reports EOF.
 *
 * Returns the number of values stored.  This is less than NNums only
 * when fscanf reported EOF first; in that case the function returns
 * immediately without touching the stream further.
 *
 * When all NNums values were read, trailing whitespace is consumed and
 * the stream is left positioned at the next non-whitespace character
 * (or at end of file).
 */
int filescan_single(FILE *infile, int NNums, float *array){

    float value;
    int count;
    int status;
    int c;

    for (count = 0; count < NNums; count++){
        // fscanf returns 0 on a matching failure: drop one character
        // and try again.
        while ( (status = fscanf(infile, "%g", &value)) == 0 ){
            (void) fgetc(infile);
        }
        if (status == EOF) {
            return count;
        }
        array[count] = value;
    }

    // Go to the end of any whitespace:
    do {
        c = fgetc(infile);
    } while (c != EOF && isspace(c));

    if (c != EOF){
        // Push the non-whitespace character back.  ungetc is used rather
        // than fseek(-1, SEEK_CUR), which is not valid on text streams
        // and fails on non-seekable streams.
        ungetc(c, infile);
    }
    return count;
}

/* Python-level docstring for FileScanN; registered in file_scanner_methods.
 * NOTE(review): the text says "Numeric vector", while the function builds
 * its result with PyArray_SimpleNew -- consider updating the wording. */
static char doc_FileScanN[] =
"FileScanN(file, N)\n\n"
"Reads N values in the ascii file, and produces a Numeric vector of\n"
"length N full of Floats (C doubles).\n\n"
"Raises an exception if there are fewer than N  numbers in the file.\n\n"
"All text in the file that is not part of a floating point number is\n"
"skipped over.\n\n"
"After reading N numbers, the file is left before the next non-whitespace\n"
"character in the file. This will often leave the file at the start of\n"
"the next line, after scanning a line full of numbers.\n";

/* Python-level docstring for FileScanN_single; registered in
 * file_scanner_methods.
 * NOTE(review): the text says "Numeric vector", while the function builds
 * its result with PyArray_SimpleNew -- consider updating the wording. */
static char doc_FileScanN_single[] =
"FileScanN_single(file, N)\n\n"
"Reads N values in the ascii file, and produces a Numeric vector of\n"
"length N full of single precision floats (C floats).\n\n"
"Raises an exception if there are fewer than N  numbers in the file.\n\n"
"All text in the file that is not part of a floating point number is\n"
"skipped over.\n\n"
"After reading N numbers, the file is left before the next non-whitespace\n"
"character in the file. This will often leave the file at the start of\n"
"the next line, after scanning a line full of numbers.\n";

/*
 * FileScanN(file, N): read exactly N numbers from an open Python file
 * object and return them as a 1-d array of C doubles.
 *
 * Raises ValueError if N is negative, if the file object has no
 * underlying FILE* (e.g. it is closed), or if end of file is reached
 * before N numbers were found.  Returns NULL with an exception set on
 * any failure.
 */
static PyObject * file_scanner_FileScanN(PyObject *self, PyObject *args)
{

    PyFileObject *File;
    PyArrayObject *Array;
    FILE *infile;
    npy_intp length;
    int requested;

    // The "i" format stores a C int, so parse into an int and widen
    // afterwards; writing it straight into an npy_intp would leave the
    // upper bytes uninitialised on 64 bit builds.
    if (!PyArg_ParseTuple(args, "O!i", &PyFile_Type, &File, &requested) ) {
        return NULL;
    }
    if (requested < 0) {
        PyErr_SetString (PyExc_ValueError, "N must not be negative");
        return NULL;
    }

    infile = PyFile_AsFile( (PyObject*)File );
    if (infile == NULL) {
        PyErr_SetString (PyExc_ValueError, "I/O operation on closed file");
        return NULL;
    }

    length = requested;
    Array = (PyArrayObject *) PyArray_SimpleNew(1, &length, PyArray_DOUBLE);
    if (Array == NULL) {
        return NULL;
    }

    // A freshly created array is contiguous, so the scanner can fill it
    // directly; no temporary buffer or element-wise copy is required.
    if (filescan_double(infile, requested, (double *) PyArray_DATA(Array)) < requested){
        Py_DECREF(Array);
        PyErr_SetString (PyExc_ValueError,
                     "End of File reached before all numbers found");
        return NULL;
    }

    return PyArray_Return(Array);
}
/*
 * FileScanN_single(file, N): read exactly N numbers from an open Python
 * file object and return them as a 1-d array of C floats.
 *
 * Raises ValueError if N is negative, if the file object has no
 * underlying FILE* (e.g. it is closed), or if end of file is reached
 * before N numbers were found.  Returns NULL with an exception set on
 * any failure.
 */
static PyObject * file_scanner_FileScanN_single(PyObject *self, PyObject *args)
{

    PyFileObject *File;
    PyArrayObject *Array;
    FILE *infile;
    npy_intp length;
    int requested;

    // The "i" format stores a C int, so parse into an int and widen
    // afterwards; writing it straight into an npy_intp would leave the
    // upper bytes uninitialised on 64 bit builds.
    if (!PyArg_ParseTuple(args, "O!i", &PyFile_Type, &File, &requested) ) {
        return NULL;
    }
    if (requested < 0) {
        PyErr_SetString (PyExc_ValueError, "N must not be negative");
        return NULL;
    }

    infile = PyFile_AsFile( (PyObject*)File );
    if (infile == NULL) {
        PyErr_SetString (PyExc_ValueError, "I/O operation on closed file");
        return NULL;
    }

    length = requested;
    Array = (PyArrayObject *) PyArray_SimpleNew(1, &length, PyArray_FLOAT);
    if (Array == NULL) {
        return NULL;
    }

    // A freshly created array is contiguous, so the scanner can fill it
    // directly; no temporary buffer or element-wise copy is required.
    if (filescan_single(infile, requested, (float *) PyArray_DATA(Array)) < requested){
        Py_DECREF(Array);
        PyErr_SetString (PyExc_ValueError,
                     "End of File reached before all numbers found");
        return NULL;
    }

    return PyArray_Return(Array);
}

/* Python-level docstring for FileScan; registered in file_scanner_methods.
 * NOTE(review): the text says "Numeric vector", while the function builds
 * its result with PyArray_SimpleNew -- consider updating the wording. */
static char doc_FileScan[] =
"FileScan(file)\n\n"
"Reads all the values in rest of the open ascii file: file, and produces\n"
"a Numeric vector full of Floats (C doubles).\n\n"
"All text in the file that is not part of a floating point number is\n"
"skipped over.\n\n"
;


/*
 * FileScan(file): read every remaining number from an open Python file
 * object and return them as a 1-d array of C doubles.
 *
 * Values are scanned BUFFERSIZE1 at a time into a single heap buffer
 * that doubles in size whenever fewer than BUFFERSIZE1 free slots
 * remain.  Scanning stops at the first chunk that yields no values.
 *
 * Raises ValueError if the file object has no underlying FILE* (e.g. it
 * is closed) and MemoryError if a buffer cannot be allocated.  Returns
 * NULL with an exception set on any failure.
 */
static PyObject * file_scanner_FileScan(PyObject *self, PyObject *args)
{

    FILE *infile;
    PyFileObject *File;
    PyArrayObject *Array;
    double *Buffer;
    double *Grown;
    size_t Capacity = BUFFERSIZE1;   // number of doubles Buffer can hold
    npy_intp ScanCount = 0;          // number of doubles read so far
    int n;

    if (!PyArg_ParseTuple(args, "O!", &PyFile_Type, &File) ) {
        return NULL;
    }
    infile = PyFile_AsFile( (PyObject*)File );
    if (infile == NULL) {
        PyErr_SetString (PyExc_ValueError, "I/O operation on closed file");
        return NULL;
    }

    Buffer = malloc(Capacity * sizeof *Buffer);
    if (Buffer == NULL) {
        return PyErr_NoMemory();
    }

    for (;;) {
        // Make sure a whole chunk fits behind the data read so far.
        if (Capacity - (size_t) ScanCount < BUFFERSIZE1) {
            if (Capacity > ((size_t) -1) / (2 * sizeof *Buffer)) {
                free(Buffer);
                return PyErr_NoMemory();
            }
            // Keep the old pointer until realloc is known to have
            // succeeded, so it can still be freed on failure.
            Grown = realloc(Buffer, 2 * Capacity * sizeof *Buffer);
            if (Grown == NULL) {
                free(Buffer);
                return PyErr_NoMemory();
            }
            Buffer = Grown;
            Capacity *= 2;
        }

        n = filescan_double(infile, BUFFERSIZE1, Buffer + ScanCount);
        if (n == 0) {
            break;
        }
        ScanCount += n;
    }

    // copy all the data to a PyArray
    Array = (PyArrayObject *) PyArray_SimpleNew(1, &ScanCount, PyArray_DOUBLE);
    if (Array == NULL) {
        free(Buffer);
        return NULL;
    }
    if (ScanCount > 0) {
        // A freshly created array is contiguous, so one memcpy suffices.
        memcpy(PyArray_DATA(Array), Buffer, (size_t) ScanCount * sizeof *Buffer);
    }

    free(Buffer);

    return PyArray_Return(Array);
}


/* Method table for the file_scanner module: Python name, C entry point,
 * calling convention and docstring for each exported function. */
static PyMethodDef file_scanner_methods[] = {
  {"FileScanN_single", file_scanner_FileScanN_single, METH_VARARGS,
   doc_FileScanN_single},
  {"FileScanN", file_scanner_FileScanN, METH_VARARGS,
   doc_FileScanN},
  {"FileScan", file_scanner_FileScan, METH_VARARGS,
   doc_FileScan},
  {NULL, NULL, 0, NULL} /* Sentinel */
};


void initfile_scanner(void){
  (void) Py_InitModule("file_scanner", file_scanner_methods);
  import_array()
}

test_simple_large.py
Description: application/python

_______________________________________________
NumPy-Discussion mailing list
[email protected]
http://mail.scipy.org/mailman/listinfo/numpy-discussion

Re: [Numpy-discussion] seeking advice on a fast string->array conversion

Reply via email to