Hi there,

I've been trying out your new Python directory implementation for
NXLucene, which uses PyLucene 1.9.1.

We are using gcc-3.4.6. Everything has been stable for weeks now, but I'm
reaching the 2 GB limitation of gcc-3.4.6.

I saw two problems with the PythonFileLock you implemented:

  - It appears that it is not thread-safe. Try it out with something like
100 threads.

I got timeouts within obtainTimeout() from the PythonFileLock in some of
my NXLucene multi-threading tests. There seems to be an issue as soon as
you have 2 kinds of lock names (write / commit locks).

  - It appears to be really inefficient with lots of threads compared to
the FSDirectory. I suspect lots of retries occur within
obtainTimeout().

I don't really have a lot of time to investigate this right now. Do you
guys have any ideas?

I'm gonna give a try to the gcc-4.1.0 + patch solution.

        J.

Yura Smolsky wrote:
>>> I am going to implement a fully functional Python directory, the same as the
>>> Java Lucene one. I will provide it here later, so you can include it in
>>> PyLucene if somebody needs it.
> AV> Great !
> 
> 1. Ok. Here is the deal. I have finished the class.
> This class implements FSDirectory functionality,
> but using Python only, to avoid the 2 GB limit with gcc 3.4.6. (see 
> PythonDirectory.py)
> 
> This is the test for it (see test_PythonDirectory2.py)
> 
> Please feel free to put this class into PyLucene distribution.
> 
> 2. Everything seems to be perfect on Linux (Debian), but sometimes I do
> receive random exceptions when I run testcases on Windows - about 1
> time per 5 runs.
> 
> And an even bigger problem: the optimize method does not work at all on the
> Windows platform for this index (see attached archive). Use optimizeIndex.py
> to reproduce the problem on the Windows platform. I got this exception:
> 
> Traceback (most recent call last):
>   File "D:\workshop\index\optimizeIndex.py", line 16, in ?
>     writer.optimize()
> PyLucene.JavaError: java.lang.NullPointerException
> 
> Again, everything is okay on Linux.
> 
> --
> Yura Smolsky,
> http://altervisionmedia.com/
> 
> 
> ------------------------------------------------------------------------
> 
> import os, sys
> import PyLucene
> import md5
> import time
> 
> DEBUG = False
> 
> class DebugWrapper( object ):
> 
>     def __init__(self, obj ):
>         self.obj = obj
> 
>     def __getattr__(self, name):
>         print self.obj.__class__.__name__, self.obj.name, name
>         sys.stdout.flush()
>         return getattr(self.obj, name )
>         
> class DebugFactory( object ):
>     
>     def __init__(self, klass):
>         self.klass = klass
>         
>     def __call__(self, *args, **kw):
>         instance = self.klass(*args, **kw)
>         return DebugWrapper( instance )
> 
> class PythonFileLock( object ):
>     # safe for a multimple processes
>     
>     LOCK_POLL_INTERVAL = 1000
>     
>     def __init__(self, lockDir, lockFile):
>         self.name = lockFile
>         self.lockDir = lockDir
>         self.lockFile = os.path.join(lockDir, lockFile)
>         #print self.lockFile
> 
>     def isLocked(self):
>         return os.path.exists(self.lockFile)
> 
>     def obtainTimeout( self, timeout ):
>         locked = self.obtain()
>         maxSleepCount = round(timeout / self.LOCK_POLL_INTERVAL)
>         sleepCount = 0
>         while (not locked):
>             if sleepCount >= maxSleepCount:
>                 raise Exception("Lock obtain timed out: " + self.toString())
>             time.sleep(timeout/1000)
>             locked = self.obtain()
>             sleepCount += 1
>         return locked
> 
>     def obtain( self ):
>         if not os.path.exists(self.lockDir):
>             os.makedirs(self.lockDir)
>         
>         if self.isLocked():
>             return False
> 
>         try:
>             open(self.lockFile, 'w')
>         except:
>             return False
>         else:
>             return True
> 
>     def release( self ):
>         os.remove(self.lockFile)
>         return True
>     
>     def toString(self):
>         return 'Lock@' + self.lockFile
> 
> 
> class PythonFileStream(object):
> 
>     def __init__(self, name, fh, size=0L):
>         self.name = name
>         self.fh = fh
>         self._length = size
>         self.isOpen = True
> 
>     def close(self, isClone=False):
>         if isClone or not self.isOpen:
>             return
>         self.isOpen = False
>         self.fh.close()
> 
>     def seek(self, pos):
>         self.fh.seek(pos)
> 
>     def read(self, length, pos):
>         self.fh.seek(pos)
>         return self.fh.read(length)
> 
>     def write(self, buffer):
>         self.fh.write(buffer)
>         self.fh.flush()
>         self._length += len(buffer)
> 
>     def length(self):
>         return self._length
> 
>         
> class PythonFileDirectory( object ):
> 
>     LOCK_DIR = PyLucene.System.getProperty("org.apache.lucene.lockDir",
>       PyLucene.System.getProperty("java.io.tmpdir"));
>     
>     def __init__(self, path, create=False ):
>         self.path = os.path.realpath(path)
>         self.name = self.path
>         self._locks = {}
>         self._streams = []
>         if not self.LOCK_DIR:
>             self.LOCK_DIR = self.path
>         if create:
>             self.create()
> 
>         assert os.path.isdir( path )
> 
>     def create(self):
>         if not os.path.exists(self.path):
>             os.makedirs(self.path)
> 
>         oldFiles = os.listdir(self.path)
>         for oldFile in oldFiles:
>             os.remove(os.path.join(self.path, oldFile))
> 
>         lockPrefix = self.getLockPrefix()
>         tmpFiles = os.listdir(self.LOCK_DIR)
>         for tmpFile in tmpFiles:
>             if tmpFile.startswith(lockPrefix):
>                 os.remove(os.path.join(self.LOCK_DIR, tmpFile))
>         
>         
>     def close(self):
>         for s in self._streams:
>             s.close()
> 
>     def createOutput(self, name ):
>         file_path = os.path.join( self.path, name )
>         fh = open( file_path, "w" )
>         stream = PythonFileStream( name, fh )
>         self._streams.append(stream)
>         return stream
> 
>     def deleteFile( self, name ):
>         if self.fileExists(name):
>             os.unlink( os.path.join( self.path, name ) )
> 
>     def fileExists( self, name ):
>         return os.path.exists( os.path.join( self.path, name ) )
> 
>     def fileLength( self, name ):
>         file_path = os.path.join( self.path, name )
>         return os.path.getsize( file_path )
> 
>     def fileModified( self, name ):
>         file_path = os.path.join( self.path, name )
>         return int( os.path.getmtime( file_path ))
> 
>     def list(self):
>         return os.listdir( self.path )
> 
>     def openInput( self, name ):
>         file_path = os.path.join( self.path, name )
>         fh = open( file_path, 'r')
>         stream = PythonFileStream( name, fh, os.path.getsize(file_path) )
>         self._streams.append(stream)
>         return stream
> 
>     def renameFile(self, fname, tname):
>         fromName = os.path.join( self.path, fname )
>         toName = os.path.join( self.path, tname )
>         if os.path.exists( toName ):
>             os.remove( toName )
>         os.rename( fromName, toName )
> 
>     def touchFile( self, name):
> 
>         file_path = os.path.join( self.path, name )        
>         fh = open( file_path, 'rw')
>         c = fh.read(1)
>         fh.seek(0)
>         fh.write(c)
>         fh.close()
> 
>     def makeLock( self, name ):
>         lockDir = self.LOCK_DIR
>         lockFile = self.getLockPrefix() + '-' + name
>         lock = self._locks.setdefault( name, PythonFileLock(lockDir, 
> lockFile) )
>         #print lock.toString()
>         return lock
> 
>     def getHexDigest(self, string):
>         m = md5.new(string)
>         return m.hexdigest()
>     
>     def getLockPrefix(self):
>         dirName = os.path.realpath(self.path)
>         prefix = 'lucene-' + self.getHexDigest(dirName)
>         return prefix
> 
> if DEBUG:
>     _globals = globals()
>     _globals['PythonFileDirectory'] = DebugFactory( PythonFileDirectory )
>     _globals['PythonFileStream'] = DebugFactory( PythonFileStream )
>     _globals['PythonFileLock'] = DebugFactory( PythonFileLock )
>     del _globals
> 
> 
> ------------------------------------------------------------------------
> 
> #!/usr/local/bin/python
> 
> import os, sys, unittest, shutil, weakref
> import test_PyLucene 
> from PythonDirectory import *
> 
> """
> The Directory Implementation here is for testing purposes only, not meant
> as an example of writing one, the implementation here suffers from a lack
> of safety when dealing with concurrent modifications as it does away with 
> the file locking in the default lucene fsdirectory implementation.
> """
> 
> 
> 
> 
> class PythonDirectoryTests( unittest.TestCase,
>                             test_PyLucene.Test_PyLuceneBase ):
> 
>     STORE_DIR = "testpyrepo"
> 
>     def setUp( self ):
>         if not os.path.exists( self.STORE_DIR ):
>             os.mkdir( self.STORE_DIR )
> 
>     def tearDown( self ):
>         if os.path.exists(self.STORE_DIR):
>             shutil.rmtree(self.STORE_DIR)
> 
>     def openStore( self ):
>         return PythonFileDirectory( self.STORE_DIR )
> 
>     def closeStore(self, store, *args):
>         for arg in args:
>             if arg: arg.close()
>         store.close()
> 
>     def test_IncrementalLoop( self ):
>         print "Testing Indexing Incremental Looping"
>         for i in range(100):
>             print "indexing ", i
>             sys.stdout.flush()
>             self.test_indexDocument()
>                        
> 
> if __name__ == "__main__":
>     import sys
>     if '-loop' in sys.argv:
>         sys.argv.remove('-loop')
>         while True:
>             try:
>                 unittest.main()
>             except:
>                 pass
>     else:
>         unittest.main()
> 
> 
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> pylucene-dev mailing list
> [email protected]
> http://lists.osafoundation.org/mailman/listinfo/pylucene-dev


-- 
Julien Anguenot | Nuxeo R&D (Paris, France)
Open Source ECM - www.nuxeo.com
CPS Platform - http://www.cps-project.org
Mobile: +33 (0) 6 72 57 57 66

Attachment: signature.asc
Description: OpenPGP digital signature

_______________________________________________
pylucene-dev mailing list
[email protected]
http://lists.osafoundation.org/mailman/listinfo/pylucene-dev

Reply via email to