Hi there, I've been trying out your new Python directory implementation for NXLucene, which uses PyLucene 1.9.1.
We are thus using gcc-3.4.6. Everything has been stable for weeks now, but I'm
reaching the 2 GB limitation of gcc-3.4.6.
I saw two problems with the PythonFileLock you implemented:
- It appears that it is not thread-safe. Try it out with something like
100 threads.
I got timeouts within obtainTimeout() from the PythonFileLock in some of
my NXLucene multi-threading tests. It seems there is an issue as soon as
you have 2 kinds of lock names (write / commit locks).
- It appears to be really inefficient with lots of threads compared to
the FSDirectory. I suspect lots of retries occur within
obtainTimeout().
I don't really have a lot of time to investigate this right now. Do you
guys have any ideas?
I'm going to give the gcc-4.1.0 + patch solution a try.
J.
Yura Smolsky wrote:
>>> I am going to implement a fully functional Python directory, the same as the Java
>>> Lucene one. I will provide it here later, so you can include it in PyLucene
>>> if somebody needs it.
> AV> Great !
>
> 1. OK. Here is the deal. I have finished the class.
> This class implements the FSDirectory functionality,
> but using Python only, to avoid the 2 GB limit with gcc 3.4.6 (see
> PythonDirectory.py).
>
> This is the test for it (see test_PythonDirectory2.py)
>
> Please feel free to put this class into PyLucene distribution.
>
> 2. Everything seems to be perfect on Linux (Debian), but sometimes I
> get random exceptions when I run the test cases on Windows - about 1
> run in 5.
>
> And there is an even bigger problem. The optimize method does not work at all on
> the Windows platform for this index (see attached archive). Use optimizeIndex.py
> to reproduce the problem on the Windows platform. I got this exception:
>
> Traceback (most recent call last):
> File "D:\workshop\index\optimizeIndex.py", line 16, in ?
> writer.optimize()
> PyLucene.JavaError: java.lang.NullPointerException
>
> Again, everything is okay on Linux.
>
> --
> Yura Smolsky,
> http://altervisionmedia.com/
>
>
> ------------------------------------------------------------------------
>
> import os, sys
> import PyLucene
> import md5
> import time
>
> DEBUG = False
>
> class DebugWrapper( object ):
>
> def __init__(self, obj ):
> self.obj = obj
>
> def __getattr__(self, name):
> print self.obj.__class__.__name__, self.obj.name, name
> sys.stdout.flush()
> return getattr(self.obj, name )
>
> class DebugFactory( object ):
>
> def __init__(self, klass):
> self.klass = klass
>
> def __call__(self, *args, **kw):
> instance = self.klass(*args, **kw)
> return DebugWrapper( instance )
>
> class PythonFileLock( object ):
> # safe for multiple processes
>
> LOCK_POLL_INTERVAL = 1000
>
> def __init__(self, lockDir, lockFile):
> self.name = lockFile
> self.lockDir = lockDir
> self.lockFile = os.path.join(lockDir, lockFile)
> #print self.lockFile
>
> def isLocked(self):
> return os.path.exists(self.lockFile)
>
> def obtainTimeout( self, timeout ):
> locked = self.obtain()
> maxSleepCount = round(timeout / self.LOCK_POLL_INTERVAL)
> sleepCount = 0
> while (not locked):
> if sleepCount >= maxSleepCount:
> raise Exception("Lock obtain timed out: " + self.toString())
> time.sleep(timeout/1000)
> locked = self.obtain()
> sleepCount += 1
> return locked
>
> def obtain( self ):
> if not os.path.exists(self.lockDir):
> os.makedirs(self.lockDir)
>
> if self.isLocked():
> return False
>
> try:
> open(self.lockFile, 'w')
> except:
> return False
> else:
> return True
>
> def release( self ):
> os.remove(self.lockFile)
> return True
>
> def toString(self):
> return 'Lock@' + self.lockFile
>
>
> class PythonFileStream(object):
>
> def __init__(self, name, fh, size=0L):
> self.name = name
> self.fh = fh
> self._length = size
> self.isOpen = True
>
> def close(self, isClone=False):
> if isClone or not self.isOpen:
> return
> self.isOpen = False
> self.fh.close()
>
> def seek(self, pos):
> self.fh.seek(pos)
>
> def read(self, length, pos):
> self.fh.seek(pos)
> return self.fh.read(length)
>
> def write(self, buffer):
> self.fh.write(buffer)
> self.fh.flush()
> self._length += len(buffer)
>
> def length(self):
> return self._length
>
>
> class PythonFileDirectory( object ):
>
> LOCK_DIR = PyLucene.System.getProperty("org.apache.lucene.lockDir",
> PyLucene.System.getProperty("java.io.tmpdir"));
>
> def __init__(self, path, create=False ):
> self.path = os.path.realpath(path)
> self.name = self.path
> self._locks = {}
> self._streams = []
> if not self.LOCK_DIR:
> self.LOCK_DIR = self.path
> if create:
> self.create()
>
> assert os.path.isdir( path )
>
> def create(self):
> if not os.path.exists(self.path):
> os.makedirs(self.path)
>
> oldFiles = os.listdir(self.path)
> for oldFile in oldFiles:
> os.remove(os.path.join(self.path, oldFile))
>
> lockPrefix = self.getLockPrefix()
> tmpFiles = os.listdir(self.LOCK_DIR)
> for tmpFile in tmpFiles:
> if tmpFile.startswith(lockPrefix):
> os.remove(os.path.join(self.LOCK_DIR, tmpFile))
>
>
> def close(self):
> for s in self._streams:
> s.close()
>
> def createOutput(self, name ):
> file_path = os.path.join( self.path, name )
> fh = open( file_path, "w" )
> stream = PythonFileStream( name, fh )
> self._streams.append(stream)
> return stream
>
> def deleteFile( self, name ):
> if self.fileExists(name):
> os.unlink( os.path.join( self.path, name ) )
>
> def fileExists( self, name ):
> return os.path.exists( os.path.join( self.path, name ) )
>
> def fileLength( self, name ):
> file_path = os.path.join( self.path, name )
> return os.path.getsize( file_path )
>
> def fileModified( self, name ):
> file_path = os.path.join( self.path, name )
> return int( os.path.getmtime( file_path ))
>
> def list(self):
> return os.listdir( self.path )
>
> def openInput( self, name ):
> file_path = os.path.join( self.path, name )
> fh = open( file_path, 'r')
> stream = PythonFileStream( name, fh, os.path.getsize(file_path) )
> self._streams.append(stream)
> return stream
>
> def renameFile(self, fname, tname):
> fromName = os.path.join( self.path, fname )
> toName = os.path.join( self.path, tname )
> if os.path.exists( toName ):
> os.remove( toName )
> os.rename( fromName, toName )
>
> def touchFile( self, name):
>
> file_path = os.path.join( self.path, name )
> fh = open( file_path, 'rw')
> c = fh.read(1)
> fh.seek(0)
> fh.write(c)
> fh.close()
>
> def makeLock( self, name ):
> lockDir = self.LOCK_DIR
> lockFile = self.getLockPrefix() + '-' + name
> lock = self._locks.setdefault( name, PythonFileLock(lockDir,
> lockFile) )
> #print lock.toString()
> return lock
>
> def getHexDigest(self, string):
> m = md5.new(string)
> return m.hexdigest()
>
> def getLockPrefix(self):
> dirName = os.path.realpath(self.path)
> prefix = 'lucene-' + self.getHexDigest(dirName)
> return prefix
>
> if DEBUG:
> _globals = globals()
> _globals['PythonFileDirectory'] = DebugFactory( PythonFileDirectory )
> _globals['PythonFileStream'] = DebugFactory( PythonFileStream )
> _globals['PythonFileLock'] = DebugFactory( PythonFileLock )
> del _globals
>
>
> ------------------------------------------------------------------------
>
> #!/usr/local/bin/python
>
> import os, sys, unittest, shutil, weakref
> import test_PyLucene
> from PythonDirectory import *
>
> """
> The Directory Implementation here is for testing purposes only, not meant
> as an example of writing one, the implementation here suffers from a lack
> of safety when dealing with concurrent modifications as it does away with
> the file locking in the default lucene fsdirectory implementation.
> """
>
>
>
>
> class PythonDirectoryTests( unittest.TestCase,
> test_PyLucene.Test_PyLuceneBase ):
>
> STORE_DIR = "testpyrepo"
>
> def setUp( self ):
> if not os.path.exists( self.STORE_DIR ):
> os.mkdir( self.STORE_DIR )
>
> def tearDown( self ):
> if os.path.exists(self.STORE_DIR):
> shutil.rmtree(self.STORE_DIR)
>
> def openStore( self ):
> return PythonFileDirectory( self.STORE_DIR )
>
> def closeStore(self, store, *args):
> for arg in args:
> if arg: arg.close()
> store.close()
>
> def test_IncrementalLoop( self ):
> print "Testing Indexing Incremental Looping"
> for i in range(100):
> print "indexing ", i
> sys.stdout.flush()
> self.test_indexDocument()
>
>
> if __name__ == "__main__":
> import sys
> if '-loop' in sys.argv:
> sys.argv.remove('-loop')
> while True:
> try:
> unittest.main()
> except:
> pass
> else:
> unittest.main()
>
>
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> pylucene-dev mailing list
> [email protected]
> http://lists.osafoundation.org/mailman/listinfo/pylucene-dev
--
Julien Anguenot | Nuxeo R&D (Paris, France)
Open Source ECM - www.nuxeo.com
CPS Platform - http://www.cps-project.org
Mobile: +33 (0) 6 72 57 57 66
signature.asc
Description: OpenPGP digital signature
_______________________________________________ pylucene-dev mailing list [email protected] http://lists.osafoundation.org/mailman/listinfo/pylucene-dev
