Author: dmeyer
Date: Wed Mar 29 17:23:23 2006
New Revision: 1362

Modified:
   trunk/beacon/src/crawl.py

Log:
improve crawler

Modified: trunk/beacon/src/crawl.py
==============================================================================
--- trunk/beacon/src/crawl.py   (original)
+++ trunk/beacon/src/crawl.py   Wed Mar 29 17:23:23 2006
@@ -4,141 +4,183 @@
 from kaa.notifier import Timer, OneShotTimer
 
 import parser
+from inotify import INotify
+from directory import Directory
 
 log = logging.getLogger('crawler')
 
-DIRECTORY_BLACKLIST  = [ '/usr/', '/bin/' ]
-DIRECTORY_QUICKCHECK = [ '/', '/home/', os.path.expanduser("~/") ]
-
-CHECK_TIMER  = 0.03
-PARSE_TIMER  = 0.02
-UPDATE_TIMER = 0.03
-
-_crawling = []
-
 class Crawler(object):
 
+    PARSE_TIMER  = 0.02
+    UPDATE_TIMER = 0.03
+
     active = 0
     nextid = 0
     
     def __init__(self, db):
+        """
+        Init the Crawler.
+        Parameter db is a beacon.db.Database object.
+        The Crawler is used by Mountpoint
+        """
         self.db = db
-        self._checked = []
-        self._tocheck = []
-        self._toparse = []
-        self._toupdate = []
+        self.monitoring = []
+        self.scan_directory_items = []
+        self.check_mtime_items = []
+        self.update_items = []
         Crawler.nextid += 1
         self.num = Crawler.nextid
-        
+        try:
+            self.inotify = INotify()
+        except SystemError, e:
+            log.warning('%s', e)
+            self.inotify = None
+        self.timer = None
+        self.restart_timer = None
+        self.restart_args = []
 
-    def crawl(self, item):
-        if not item.filename in DIRECTORY_QUICKCHECK + DIRECTORY_BLACKLIST:
-            items = [ item ]
-        else:
-            items = self.search(item)
-            
-        for child in items:
-            for c in _crawling:
-                if child.filename.startswith(c):
-                    break
+
+    def inotify_callback(self, mask, name):
+        if mask & INotify.WATCH_MASK:
+            item = self.db.query(filename=name)
+            if os.path.exists(name):
+                # created or modified, we don't care
+                if item._beacon_isdir:
+                    self.scan_directory_items.append(item)
+                self.check_mtime_items.append(item)
+                if not self.timer:
+                    Crawler.active += 1
+                    self.check_mtime()
             else:
-                self._toparse.append(child)
-                self._tocheck.append(child)
-                _crawling.append(child.filename)
-        if not self._toparse:
-            return
-        Crawler.active += 1
-        log.info('start crawler %s for %s' % (self.num, [ x.filename for x in items]))
-        self.timer = Timer(self.parse)
-        self.timer.start(PARSE_TIMER / Crawler.active)
+                # deleted
+                item = self.db.query(filename=name)
+                if item._beacon_id:
+                    self.db.delete_object(item._beacon_id, beacon_immediately=True)
+                if name + '/' in self.monitoring:
+                    for m in self.monitoring[:]:
+                        if m.startswith(name + '/'):
+                            if self.inotify:
+                                self.inotify.ignore(m)
+                                log.info('remove inotify for %s', m)
+                            self.monitoring.remove(m)
+
+    def append(self, item):
+        log.info('crawl %s', item)
+        self.check_mtime_items.append(item)
+        self.scan_directory_items.append(item)
+        self.restart_args.append(item)
+        if not self.timer:
+            Crawler.active += 1
+            log.info('start crawler %s' % self.num)
+            self.check_mtime()
 
 
-    def stop(self):
+    def finished(self):
         if not self.timer:
             return
         log.info('crawler %s finished', self.num)
         Crawler.active -= 1
         self.timer.stop()
         self.timer = None
-        for child in self._tocheck:
-            if child.filename in _crawling:
-                _crawling.remove(child.filename)
-        self._tocheck = self._toparse = self._toupdate = []
+        self.scan_directory_items = []
+        self.check_mtime_items = []
+        self.update_items = []
         self.db.commit()
+        if not self.inotify:
+            log.info('schedule rescan')
+            self.restart_timer = OneShotTimer(self.restart).start(10)
+                
 
+    def stop(self):
+        self.finished()
+        self.monitoring = []
+        self.inotify = None
         
-    def search(self, object):
-        if not object._beacon_isdir or object.filename in DIRECTORY_BLACKLIST:
-            return []
-        if object._beacon_data['mtime'] and \
-               not object.filename in DIRECTORY_QUICKCHECK:
-            return [ object ]
-        ret = []
-        for child in self.db.query(parent=object):
-            if not child._beacon_id:
-                continue
-            ret += self.search(child)
-        return ret
+        
+    def restart(self):
+        self.PARSE_TIMER = 1
 
+        self.monitoring = []
+        for item in self.restart_args:
+            self.check_mtime_items.append(item)
+            self.scan_directory_items.append(item)
+        Crawler.active += 1
+        log.info('start crawler %s' % self.num)
+        self.check_mtime()
 
-    def check(self):
+        
+    def scan_directory(self):
         if not self.timer:
             return False
 
-        if not self._tocheck:
-            self.stop()
+        if not self.scan_directory_items:
+            self.finished()
             return False
 
-        item = self._tocheck.pop(0)
-        self._checked.append(item)
+        item = self.scan_directory_items.pop(0)
+        if not isinstance(item, Directory):
+            log.warning('%s is no directory item', item)
+            if hasattr(item, 'filename') and item.filename + '/' in self.monitoring:
+                self.monitoring.remove(item.filename + '/')
+            return True
+
         log.debug('check %s', item)
-        if item.filename in _crawling:
-            _crawling.remove(item.filename)
         for child in self.db.query(parent=item):
             if child._beacon_isdir:
-                for x in self._tocheck + self._checked:
-                    if child.filename == x.filename:
-                        self._toparse.append(child)
+                for fname in [ f.filename for f in self.scan_directory_items ] + \
+                        self.monitoring:
+                    if child.filename == fname:
+                        self.check_mtime_items.append(child)
                         break
                 else:
-                    self._toparse.append(child)
-                    self._tocheck.append(child)
-                    _crawling.append(child.filename)
+                    self.check_mtime_items.append(child)
+                    self.scan_directory_items.append(child)
                 continue
-            self._toparse.append(child)
-        self.timer = Timer(self.parse)
-        self.timer.start(PARSE_TIMER / Crawler.active)
+            self.check_mtime_items.append(child)
+        if not item.filename in self.monitoring:
+            if self.inotify:
+                log.info('add inotify for %s' % item.filename)
+                self.inotify.watch(item.filename[:-1]).connect(self.inotify_callback)
+            self.monitoring.append(item.filename)
+        self.check_mtime()
         return True
 
 
-    def parse(self):
+    def check_mtime(self):
+        self.timer = Timer(self.check_mtime_step)
+        self.timer.start(self.PARSE_TIMER / Crawler.active)
+
+        
+    def check_mtime_step(self):
         if not self.timer:
             return False
         counter = 0
         while True:
-            if not self._toparse:
-                if self._toupdate:
-                    self.timer = Timer(self.update)
-                    self.timer.start(UPDATE_TIMER / Crawler.active)
-                else:
-                    self.timer = OneShotTimer(self.check)
-                    self.timer.start(CHECK_TIMER / Crawler.active)
+            if not self.check_mtime_items:
+                self.update()
                 return False
-            item = self._toparse.pop(0)
+            item = self.check_mtime_items.pop(0)
             counter += 1
             if item._beacon_data['mtime'] != item._beacon_mtime():
-                self._toupdate.append(item)
-            if counter == 20 and len(self._toparse) > 10:
+                self.update_items.append(item)
+            if counter == 20 and len(self.check_mtime_items) > 10:
                 return True
 
 
     def update(self):
+        if self.update_items:
+            self.timer = Timer(self.update_step)
+            self.timer.start(self.UPDATE_TIMER / Crawler.active)
+        else:
+            self.scan_directory()
+        
+
+    def update_step(self):
         if not self.timer:
             return False
-        if not self._toupdate:
-            self.timer = OneShotTimer(self.check)
-            self.timer.start(CHECK_TIMER / Crawler.active)
+        if not self.update_items:
+            self.scan_directory()
             return False
-        item = self._toupdate.pop(0)
+        item = self.update_items.pop(0)
         parser.parse(self.db, item)
         return True


-------------------------------------------------------
This SF.Net email is sponsored by xPML, a groundbreaking scripting language
that extends applications into web and mobile media. Attend the live webcast
and join the prime developer group breaking into this new coding territory!
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=110944&bid=241720&dat=121642
_______________________________________________
Freevo-cvslog mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/freevo-cvslog

Reply via email to