I have the following scrip which transforms several pytables files by
changing the format of one field in a table from a StringCol to Int32
col. I have a couple of problems:

1. The memory usage of the process keeps growing and then dies. I have
to restart it several times
2. Is there any way to do this without converting it to a python list
and make it faster.

import os
from tables import *
from time import clock, time
from path import path
from mx.DateTime import Parser, Time, DateTime
from attrdict import attrdict
filterProps = Filters(complevel=2,  complib='blosc')

#Old table description:
#class DataDescriptionDaily(IsDescription):
#    fullSymbol = StringCol(100, pos=0)
#    calcField = StringCol(100, pos=1)
#    hhmmss = StringCol(6, pos=2)
#    value = Float32Col(pos=3)

class DataDescriptionDaily(IsDescription):
    fullSymbol = StringCol(100, pos=0)
    calcField = StringCol(100, pos=1)
    hhmmss = Int32Col(pos=2)
    value = Float32Col(pos=3)

def getTables(mapping):
    try:
        mapping.tblData = mapping.hdf5.getNode('/data')
    except:
        mapping.tblData = mapping.hdf5.createTable('/', 'data',
DataDescriptionDaily, expectedrows=75000000, filters=filterProps)
        createIndexes(mapping)


def createIndexes(mapping):
    if mapping.tblData.cols.hhmmss.is_indexed:
        mapping.tblData.cols.hhmmss.reIndex()
    else:
        mapping.tblData.cols.hhmmss.createCSIndex()
    if mapping.tblData.cols.calcField.is_indexed:
        mapping.tblData.cols.calcField.reIndex()
    else:
        mapping.tblData.cols.calcField.createCSIndex()
    if mapping.tblData.cols.fullSymbol.is_indexed:
        mapping.tblData.cols.fullSymbol.reIndex()
    else:
        mapping.tblData.cols.fullSymbol.createCSIndex()
    mapping.hdf5.flush()

dirPathList = [path("/calc")]
for dirPath in dirPathList:
    for f in dirPath.files("*.h5"):
        oldFileName = path("%s.old" % f)
        if oldFileName.exists():
            print 'Ignoring file: ', f
            continue

        mapping = attrdict()
        mapping.filePath = f
        mapping.hdf5 = openFile(mapping.filePath, "a")
        getTables(mapping)

        mapping2 = attrdict()
        mapping2.filePath = f+"_new"
        mapping2.hdf5 = openFile(mapping2.filePath, "w")
        getTables(mapping2)

        #save symbol mapping
        rowsToBeInserted = []
        for row in mapping.tblData.iterrows():
            row[2] = int(row[2])
            rowsToBeInserted.append((row[0], row[1], row[2], row[3]))
        if rowsToBeInserted:
            mapping2.tblData.append(rowsToBeInserted)
        mapping2.tblData.flush()

        #create index and flush file
        createIndexes(mapping2)
        mapping2.hdf5.flush()

        #get rid of old file
        lines = os.popen("mv %s %s.old" % (mapping.filePath,
mapping.filePath)).readlines()
        if lines: print lines

        lines = os.popen("mv %s %s" % (mapping2.filePath,
mapping.filePath)).readlines()
        if lines: print lines

------------------------------------------------------------------------------
This SF.net email is sponsored by 

Make an app they can't live without
Enter the BlackBerry Developer Challenge
http://p.sf.net/sfu/RIM-dev2dev 
_______________________________________________
Pytables-users mailing list
Pytables-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/pytables-users

Reply via email to