Hi all,

I am using libxml2 to parse HTML in Python. I suspected that libxml2
could be involved in a memory problem, so I modified one of the Python
examples from the website in order to process a relevant number of HTML
files while I checked the memory consumption with the top command.
And... yes! The program's memory consumption keeps increasing until it finishes.

Am I forgetting something in the code? Or is there something wrong with the
Python bindings?

Thank you, Cesar

Note1: I do nothing in the Callback
Note2: I have tried calling the cleanup functions after the 'ctxt = None',
with the same results.

****************************************] The Code
[****************************************

#!/usr/bin/python -u
import libxml2

#------------------------------------------------------------------------------


# Memory debug specific: switch on libxml2's memory debugging before any
# parsing happens. The check at the end of the script calls debugMemory()
# again and reports its return value as the number of bytes still allocated.
libxml2.debugMemory(1)

#------------------------------------------------------------------------------


class callback:
    """SAX-style event handler that ignores everything it is told.

    Every event method is a no-op, except that the start of each document
    prints a single "." as a progress marker. The handler stores nothing
    on itself, so it keeps no parsed data alive between documents.
    """

    def startDocument(self):
        """Print a progress dot when a new document starts."""
        # Parenthesised single-argument form: same output as Python 2's
        # print statement, and still valid syntax under Python 3.
        print(".")

    def endDocument(self):
        """Ignore the end-of-document event."""
        pass

    def startElement(self, tag, attrs):
        """Ignore an opening tag and its attributes."""
        pass

    def endElement(self, tag):
        """Ignore a closing tag."""
        pass

    def characters(self, data):
        """Ignore character data."""
        pass

    def warning(self, msg):
        """Ignore a parser warning."""
        pass

    def error(self, msg):
        """Ignore a recoverable parser error."""
        pass

    def fatalError(self, msg):
        """Ignore a fatal parser error."""
        pass

#------------------------------------------------------------------------------
#------------------------------------------------------------------------------

import os
import sys

programName = os.path.basename(sys.argv[0])

# Exactly one argument is expected: the directory holding the HTML files.
if len(sys.argv) != 2:
    print("Use: %s <dir html files>" % programName)
    sys.exit(1)

inputPath = sys.argv[1]

# isdir() rather than exists(): a path that names a regular file would pass
# an existence check and only fail later, when the directory is listed.
if not os.path.isdir(inputPath):
    print("Error: directory does not exist")
    sys.exit(1)

# Keep only the directory entries whose name ends in ".htm" or ".html".
# A plain suffix test is used because comparing rfind() offsets also let
# through names such as "page.htmx" or "page.htm~", whose last extension
# merely starts with ".htm".
dirContent = os.listdir(inputPath)
inputFileNames = [
    fichero
    for fichero in dirContent
    if fichero.endswith(".htm") or fichero.endswith(".html")
]

if not inputFileNames:
    print("Error: no input files")
    sys.exit(1)


handler = callback()
# Number of passes over the whole file list; every pass parses each file
# again from scratch.
NUM_ITERS = 5
for i in range(NUM_ITERS):
    for inputFileName in inputFileNames:
        print(inputFileName)
        # os.path.join() supplies the separator, so the directory argument
        # no longer has to end in "/" for the file to be found.
        inputFilePath = os.path.join(inputPath, inputFileName)
        f = open(inputFilePath)
        try:
            data = f.read()
        finally:
            # Close the file even when the read fails.
            f.close()

        ctxt = libxml2.htmlCreatePushParser(handler, "", 0, inputFileName)

        # Feed the whole document in one call; the final argument 1 marks
        # this chunk as the last one.
        ctxt.htmlParseChunk(data, len(data), 1)
        # Drop the script's reference to the parser context.
        ctxt = None


# Memory debug specific: run libxml2's parser cleanup, then report whether
# debugMemory() still counts any bytes as allocated.
libxml2.cleanupParser()
if libxml2.debugMemory(1) != 0:
    print("Memory leak %d bytes" % (libxml2.debugMemory(1)))
    libxml2.dumpMemory()
else:
    print("OK")

# Other cleanup functions
#libxml2.cleanupCharEncodingHandlers()
#libxml2.cleanupEncodingAliases()
#libxml2.cleanupGlobals()
#libxml2.cleanupInputCallbacks()
#libxml2.cleanupOutputCallbacks()
#libxml2.cleanupPredefinedEntities()
_______________________________________________
xml mailing list, project page  http://xmlsoft.org/
[email protected]
http://mail.gnome.org/mailman/listinfo/xml

Reply via email to