Yura Smolsky wrote: > Hello, Julien. > > i will look into the problem today. thanks.
To be more precise, here are the errors I got:
File "/usr/local/lib/python2.4/site-packages/PyLucene.py", line 2913,
in __init__
newobj = _PyLucene.new_IndexSearcher(*args)
File "./src/nxlucene/directory/PythonDirectory.py", line 68, in release
os.remove(self.lockFile)
OSError: [Errno 2] No such file or directory:
'/tmp/lucene-a14687cc515226325f527aade90082ad-commit.lock'
It seems the lock file is removed by another thread (at most 5 errors out
of 100 threads).
I'm currently investigating. I see two possibilities:
1. My code does something wrong with the writer / indexer / searcher,
and the PythonFileDirectory is more sensitive to this.
2. There is an actual bug in the PythonFileLock implementation.
I'll keep you posted if I find something.
J.
> JA> Hi there,
>
> JA> I've been trying out your new Python directory implementation for
> JA> NXLucene which is using PyLucene 1.9.1
>
> JA> We are then using gcc-3.4.6. Everything has been stable for weeks now, but I'm
> JA> reaching the 2 GB limitation of gcc-3.4.6.
>
> JA> I saw two problems with the PythonFileLock you implemented :
>
> JA> - It appears that it is not thread safe. Try out with something like
> JA> 100 threads.
>
> JA> I got timeouts within obtainTimeout() from the PythonFileLock in some of
> JA> my NXLucene multi-threading tests. It seems there is an issue as soon as
> JA> you have two kinds of lock names (write / commit locks).
>
> JA> - It appears to be really inefficient with lots of threads compared to
> JA> the FSDirectory. I suspect lots of retries occurred within the
> JA> obtainTimeout().
>
> JA> I don't really have a lot of time to investigate this right now. Do you
> JA> guys have any ideas?
>
> JA> I'm gonna give a try to the gcc-4.1.0 + patch solution.
>
> JA> J.
>
> JA> Yura Smolsky wrote:
>>>>> I am going to implement a fully functional Python directory, equivalent to the Java
>>>>> Lucene one. I will provide it here later, so you can include it in PyLucene
>>>>> if somebody needs it.
>>> AV> Great !
>>>
>>> 1. Ok. Here is the deal. I have finished the class.
>>> This class implements FSDirectory functionality,
>>> but using Python only, to avoid the 2 GB limit with gcc 3.4.6 (see
>>> PythonDirectory.py).
>>>
>>> This is the test for it (see test_PythonDirectory2.py)
>>>
>>> Please feel free to put this class into PyLucene distribution.
>>>
>>> 2. Everything seems to be perfect on Linux (Debian), but sometimes I
>>> receive random exceptions when I run the test cases on Windows - about
>>> once every 5 runs.
>>>
>>> And an even bigger problem: the optimize method does not work at all on
>>> the Windows platform for this index (see attached archive). Use optimizeIndex.py
>>> to reproduce the problem on the Windows platform. I got this exception:
>>>
>>> Traceback (most recent call last):
>>> File "D:\workshop\index\optimizeIndex.py", line 16, in ?
>>> writer.optimize()
>>> PyLucene.JavaError: java.lang.NullPointerException
>>>
>>> Again, everything is okay on Linux.
>>>
>>> --
>>> Yura Smolsky,
>>> http://altervisionmedia.com/
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> import os, sys
>>> import PyLucene
>>> import md5
>>> import time
>>>
>>> DEBUG = False
>>>
>>> class DebugWrapper( object ):
>>>
>>> def __init__(self, obj ):
>>> self.obj = obj
>>>
>>> def __getattr__(self, name):
>>> print self.obj.__class__.__name__, self.obj.name, name
>>> sys.stdout.flush()
>>> return getattr(self.obj, name )
>>>
>>> class DebugFactory( object ):
>>>
>>> def __init__(self, klass):
>>> self.klass = klass
>>>
>>> def __call__(self, *args, **kw):
>>> instance = self.klass(*args, **kw)
>>> return DebugWrapper( instance )
>>>
>>> class PythonFileLock( object ):
>>> # safe for a multimple processes
>>>
>>> LOCK_POLL_INTERVAL = 1000
>>>
>>> def __init__(self, lockDir, lockFile):
>>> self.name = lockFile
>>> self.lockDir = lockDir
>>> self.lockFile = os.path.join(lockDir, lockFile)
>>> #print self.lockFile
>>>
>>> def isLocked(self):
>>> return os.path.exists(self.lockFile)
>>>
>>> def obtainTimeout( self, timeout ):
>>> locked = self.obtain()
>>> maxSleepCount = round(timeout / self.LOCK_POLL_INTERVAL)
>>> sleepCount = 0
>>> while (not locked):
>>> if sleepCount >= maxSleepCount:
>>> raise Exception("Lock obtain timed out: " + self.toString())
>>> time.sleep(timeout/1000)
>>> locked = self.obtain()
>>> sleepCount += 1
>>> return locked
>>>
>>> def obtain( self ):
>>> if not os.path.exists(self.lockDir):
>>> os.makedirs(self.lockDir)
>>>
>>> if self.isLocked():
>>> return False
>>>
>>> try:
>>> open(self.lockFile, 'w')
>>> except:
>>> return False
>>> else:
>>> return True
>>>
>>> def release( self ):
>>> os.remove(self.lockFile)
>>> return True
>>>
>>> def toString(self):
>>> return 'Lock@' + self.lockFile
>>>
>>>
>>> class PythonFileStream(object):
>>>
>>> def __init__(self, name, fh, size=0L):
>>> self.name = name
>>> self.fh = fh
>>> self._length = size
>>> self.isOpen = True
>>>
>>> def close(self, isClone=False):
>>> if isClone or not self.isOpen:
>>> return
>>> self.isOpen = False
>>> self.fh.close()
>>>
>>> def seek(self, pos):
>>> self.fh.seek(pos)
>>>
>>> def read(self, length, pos):
>>> self.fh.seek(pos)
>>> return self.fh.read(length)
>>>
>>> def write(self, buffer):
>>> self.fh.write(buffer)
>>> self.fh.flush()
>>> self._length += len(buffer)
>>>
>>> def length(self):
>>> return self._length
>>>
>>>
>>> class PythonFileDirectory( object ):
>>>
>>> LOCK_DIR = PyLucene.System.getProperty("org.apache.lucene.lockDir",
>>> PyLucene.System.getProperty("java.io.tmpdir"));
>>>
>>> def __init__(self, path, create=False ):
>>> self.path = os.path.realpath(path)
>>> self.name = self.path
>>> self._locks = {}
>>> self._streams = []
>>> if not self.LOCK_DIR:
>>> self.LOCK_DIR = self.path
>>> if create:
>>> self.create()
>>>
>>> assert os.path.isdir( path )
>>>
>>> def create(self):
>>> if not os.path.exists(self.path):
>>> os.makedirs(self.path)
>>>
>>> oldFiles = os.listdir(self.path)
>>> for oldFile in oldFiles:
>>> os.remove(os.path.join(self.path, oldFile))
>>>
>>> lockPrefix = self.getLockPrefix()
>>> tmpFiles = os.listdir(self.LOCK_DIR)
>>> for tmpFile in tmpFiles:
>>> if tmpFile.startswith(lockPrefix):
>>> os.remove(os.path.join(self.LOCK_DIR, tmpFile))
>>>
>>>
>>> def close(self):
>>> for s in self._streams:
>>> s.close()
>>>
>>> def createOutput(self, name ):
>>> file_path = os.path.join( self.path, name )
>>> fh = open( file_path, "w" )
>>> stream = PythonFileStream( name, fh )
>>> self._streams.append(stream)
>>> return stream
>>>
>>> def deleteFile( self, name ):
>>> if self.fileExists(name):
>>> os.unlink( os.path.join( self.path, name ) )
>>>
>>> def fileExists( self, name ):
>>> return os.path.exists( os.path.join( self.path, name ) )
>>>
>>> def fileLength( self, name ):
>>> file_path = os.path.join( self.path, name )
>>> return os.path.getsize( file_path )
>>>
>>> def fileModified( self, name ):
>>> file_path = os.path.join( self.path, name )
>>> return int( os.path.getmtime( file_path ))
>>>
>>> def list(self):
>>> return os.listdir( self.path )
>>>
>>> def openInput( self, name ):
>>> file_path = os.path.join( self.path, name )
>>> fh = open( file_path, 'r')
>>> stream = PythonFileStream( name, fh, os.path.getsize(file_path) )
>>> self._streams.append(stream)
>>> return stream
>>>
>>> def renameFile(self, fname, tname):
>>> fromName = os.path.join( self.path, fname )
>>> toName = os.path.join( self.path, tname )
>>> if os.path.exists( toName ):
>>> os.remove( toName )
>>> os.rename( fromName, toName )
>>>
>>> def touchFile( self, name):
>>>
>>> file_path = os.path.join( self.path, name )
>>> fh = open( file_path, 'rw')
>>> c = fh.read(1)
>>> fh.seek(0)
>>> fh.write(c)
>>> fh.close()
>>>
>>> def makeLock( self, name ):
>>> lockDir = self.LOCK_DIR
>>> lockFile = self.getLockPrefix() + '-' + name
>>> lock = self._locks.setdefault( name, PythonFileLock(lockDir,
>>> lockFile) )
>>> #print lock.toString()
>>> return lock
>>>
>>> def getHexDigest(self, string):
>>> m = md5.new(string)
>>> return m.hexdigest()
>>>
>>> def getLockPrefix(self):
>>> dirName = os.path.realpath(self.path)
>>> prefix = 'lucene-' + self.getHexDigest(dirName)
>>> return prefix
>>>
>>> if DEBUG:
>>> _globals = globals()
>>> _globals['PythonFileDirectory'] = DebugFactory( PythonFileDirectory )
>>> _globals['PythonFileStream'] = DebugFactory( PythonFileStream )
>>> _globals['PythonFileLock'] = DebugFactory( PythonFileLock )
>>> del _globals
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> #!/usr/local/bin/python
>>>
>>> import os, sys, unittest, shutil, weakref
>>> import test_PyLucene
>>> from PythonDirectory import *
>>>
>>> """
>>> The Directory Implementation here is for testing purposes only, not meant
>>> as an example of writing one, the implementation here suffers from a lack
>>> of safety when dealing with concurrent modifications as it does away with
>>> the file locking in the default lucene fsdirectory implementation.
>>> """
>>>
>>>
>>>
>>>
>>> class PythonDirectoryTests( unittest.TestCase,
>>> test_PyLucene.Test_PyLuceneBase ):
>>>
>>> STORE_DIR = "testpyrepo"
>>>
>>> def setUp( self ):
>>> if not os.path.exists( self.STORE_DIR ):
>>> os.mkdir( self.STORE_DIR )
>>>
>>> def tearDown( self ):
>>> if os.path.exists(self.STORE_DIR):
>>> shutil.rmtree(self.STORE_DIR)
>>>
>>> def openStore( self ):
>>> return PythonFileDirectory( self.STORE_DIR )
>>>
>>> def closeStore(self, store, *args):
>>> for arg in args:
>>> if arg: arg.close()
>>> store.close()
>>>
>>> def test_IncrementalLoop( self ):
>>> print "Testing Indexing Incremental Looping"
>>> for i in range(100):
>>> print "indexing ", i
>>> sys.stdout.flush()
>>> self.test_indexDocument()
>>>
>>>
>>> if __name__ == "__main__":
>>> import sys
>>> if '-loop' in sys.argv:
>>> sys.argv.remove('-loop')
>>> while True:
>>> try:
>>> unittest.main()
>>> except:
>>> pass
>>> else:
>>> unittest.main()
>>>
>>>
>>>
>>>
>>> ------------------------------------------------------------------------
>>>
>>> _______________________________________________
>>> pylucene-dev mailing list
>>> [email protected]
>>> http://lists.osafoundation.org/mailman/listinfo/pylucene-dev
>
>
>
>
>
> --
> Yura Smolsky,
> http://altervisionmedia.com/
>
>
--
Julien Anguenot | Nuxeo R&D (Paris, France)
Open Source ECM - www.nuxeo.com
CPS Platform - http://www.cps-project.org
Mobile: +33 (0) 6 72 57 57 66
signature.asc
Description: OpenPGP digital signature
_______________________________________________ pylucene-dev mailing list [email protected] http://lists.osafoundation.org/mailman/listinfo/pylucene-dev
