Hi Andrew, The reason I am using mapped objects is that I need to abstract from the database implementation allowing the replication to target a number of different platforms. This will definitely slow things down. > process a whole pile in memory and then (perhaps every 10,000 - when your > memory is about to run out and start paging) flush the session. Under windows how can I tell when memory is about to run out? I guess there is no cross-platform solution to this. Writing external files has all come about from a post titled "Memory problems (garbage collection) by Carbon Man" which I never got a resolution to. I was trying to execute gc.collect() when a process was complete because I was having huge problems with memory (though the problem still remains). If I stop at "import schema" There are 2524104 objects processed by gc.collect()
There is a new issue in the code (marked with #### Problem with SQLalchemy), but here is the program as it now stands: #!/usr/bin/python # -*- coding: utf-8 -*- # Reads an XML file and creates schema.py from the TABLES branch (very simplistic table structures) # Processes the nodes within the DATA branch, into SQLalchemy from xml.dom import minidom import os import codecs from cStringIO import StringIO import time from datetime import * from sqlalchemy import * from sqlalchemy.orm import * class xmlProcessing: """ General class for XML processing""" def process(self, filename="", xmlString=""): if xmlString: pass elif filename: xmldoc = minidom.parse(filename) self.parse( xmldoc.documentElement ) def parseBranch(self, parentNode): """ Process an XML branch """ for node in parentNode.childNodes: try: parseMethod = getattr(self, "parse_%s" % node.__class__.__name__) except AttributeError: continue if parseMethod(node): continue self.parseBranch(node) def parse_Document(self, node): pass def parse_Text(self, node): pass def parse_Comment(self, node): pass def parse_Element(self, node): try: handlerMethod = getattr(self, "do_%s" % node.tagName) except AttributeError: return False handlerMethod(node) return True class reptorParsing(xmlProcessing): """ Specific class for generating a SQLalchemy program to create tables and populate them with data""" def __init__(self): self.schemaPreface = StringIO() self.schemaPreface.write("""from sqlalchemy import * import time from datetime import * from sqlalchemy.ext.declarative import declarative_base #engine = create_engine('sqlite:///tutorial.db', echo=False) #metadata = MetaData() Base = declarative_base()""") self.schemaTables = StringIO() self.schemaTablesCreate = StringIO() self.schemaFields = StringIO() self.tableDict = {} self.tableName = StringIO() self.tables = StringIO() def parse(self, parentNode): """Main entry point to begin processing a XML document""" self.parseBranch(parentNode) # Properties such as schemaTables and .tables are populated by the various methods below fupdate=codecs.open(os.path.join(os.getcwd(), "update.py"), 'w', 'UTF-8') if self.schemaTables: f=codecs.open(os.path.join(os.getcwd(), "schema.py"), 'w', 'UTF-8') f.write(self.schemaPreface.getvalue()+u"\n"+self.schemaTables.getvalue()+ u"if __name__ == '__main__':\n" + self.schemaTablesCreate.getvalue() + u"\n engine = create_engine('sqlite:///tutorial.db', echo=False)\n" + u" metadata = Base.metadata\n" + u" metadata.create_all(engine)\n") f.close() if self.tables: fupdate.write(self.tables.getvalue()) fupdate.close() def do_TABLES(self, tableNode): """Process schema for tables""" for node in tableNode.childNodes: self.tableName = node.tagName # Define a declaritive mapping class self.schemaTables.write("""\nclass %s(Base): __tablename__ = '%s' """ % (self.tableName, self.tableName)) self.schemaTablesCreate.write("\n tableDef = "+self.tableName+"()") self.schemaFields = StringIO() # allow for userA = users("Billy","Bob") via a __init__() self.schemaInitPreface = StringIO() self.schemaInitPreface.write(" def __init__(self") self.schemaInitBody = StringIO() self.parseBranch(node) self.schemaInitPreface.write("):\n") self.schemaTables.write(self.schemaFields.getvalue() + "\n" + \ self.schemaInitPreface.getvalue() + \ self.schemaInitBody.getvalue() + "\n") # Need a way to execute schema so that the tables will be created. def do_FIELDS(self, fieldsNode): """Process schema for fields within tables""" for node in fieldsNode.childNodes: if self.schemaFields: self.schemaFields.write("\n") cType = "" initType = '""' # The attribute type holds the type of field crType = node.attributes["type"].value if crType==u"C": cType = "String(length=%s)" % node.attributes["len"].value initType = '""' elif crType==u"N" and node.attributes["dec"].value==u'0': cType = "Integer" initType = '0' elif crType==u"N": cType = "Numeric(precision=%s, scale=%s)" % (node.attributes["len"].value,node.attributes["dec"].value) initType = '0' elif crType==u"L": cType = "Boolean" initType = 'True' elif crType==u"T": cType = "DateTime" initType = "datetime.strptime('','')" elif crType==u"D": cType = "Date" initType = "datetime.strptime('','')" elif crType==u"M" or crType==u"G": cType = "Text" initType = '""' if node.attributes.getNamedItem("primary"): cType += ", primary_key=True" self.schemaFields.write(" %s = Column(%s)" % (node.tagName, cType)) self.schemaInitPreface.write(", \\\n %s=%s" % (node.tagName, initType)) self.schemaInitBody.write(" self.%s = %s\n" % (node.tagName, node.tagName)) self.tableDict[self.tableName + "." + node.tagName] = crType def do_DATA(self, dataNode): """This is for processing actual data to be pushed into the tables Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE -> FIELD_NAME -> VALUE""" # This is the schema program created in the TABLES branch import schema engine = create_engine('sqlite:///tutorial.db', echo=False) Session = sessionmaker(bind=engine) self.session = Session() for node in dataNode.childNodes: self.tableName = node.tagName self.keyValue = "" self.keyField = node.attributes["key"].value self.parseBranch(node) #self.tables.write("\nimport update_%s" % (self.tableName)) self.fDataUpdate.close() self.session.commit() self.session.close() def do_TUPLE(self, tupleNode): """ A TUPLE is what the XML file refers to a table row. Sits below a DATA->table_name branch """ import schema colValues = {} TI = eval("schema."+self.tableName+"()") exists = None for node in tupleNode.childNodes: for dataNode in node.childNodes: crType = self.tableDict[self.tableName + "." + node.tagName] if crType==u"C" or crType==u"M": cValue = dataNode.data elif crType==u"T": cValue = datetime.strptime(dataNode.data, "%Y-%m-%d %H:%M") elif crType==u"D": cValue = datetime.strptime(dataNode.data, "%Y-%m-%d") else: cValue = dataNode.data if node.tagName == self.keyField: #### Problem with SQLalchemy. exists = self.session.query(TI).filter(getattr(TI,self.keyField)==cValue) # Set the value onto a list to be processed below colValues[node.tagName] = cValue if exists is None: return if exists.count == 0: entry = TI else: entry = exists[0] for col_name in colValues: setattr(entry, col_name, colValues[col_name]) if exists.count == 0: # Insert self.session.add(TI) if __name__ == '__main__': replicate = reptorParsing() replicate.process(filename=os.path.join(os.getcwd(), "request.xml")) del replicate -- http://mail.python.org/mailman/listinfo/python-list