Log message for revision 115092: More catalog work, merge in the non-queryplan and non-btree changes of queryplan and optimize more of date range index internals
Changed: U Zope/trunk/doc/CHANGES.rst U Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py U Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py U Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py U Zope/trunk/src/Products/PluginIndexes/interfaces.py U Zope/trunk/src/Products/ZCatalog/Catalog.py -=- Modified: Zope/trunk/doc/CHANGES.rst =================================================================== --- Zope/trunk/doc/CHANGES.rst 2010-07-25 18:09:26 UTC (rev 115091) +++ Zope/trunk/doc/CHANGES.rst 2010-07-25 22:03:14 UTC (rev 115092) @@ -56,6 +56,17 @@ Features Added ++++++++++++++ +- Various optimizations to indexes _apply_index and the catalog's search + method inspired by experimental.catalogqueryplan. + +- Added a new ILimitedResultIndex to Products.PluginIndexes and made most + built-in indexes compatible with it. This allows indexes to consider the + already calculated result set inside their own calculations. + +- Changed the internals of the DateRangeIndex to always use IITreeSet and do + an inline migration from IISet. Some datum tend to have large number of + documents, for example when using default floor or ceiling dates. + - Added a new reporting tab to `Products.ZCatalog` instances. You can use this to get an overview of slow catalog queries, as specified by a configurable threshold value. The reports are per running Zope process. Modified: Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py =================================================================== --- Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py 2010-07-25 18:09:26 UTC (rev 115091) +++ Zope/trunk/src/Products/PluginIndexes/DateIndex/DateIndex.py 2010-07-25 22:03:14 UTC (rev 115092) @@ -157,7 +157,7 @@ return returnStatus - def _apply_index(self, request): + def _apply_index(self, request, resultset=None): """Apply the index to query parameters given in the argument Normalize the 'query' arguments into integer values at minute @@ -176,7 +176,7 @@ #experimental code for specifing the operator operator = record.get( 'operator', self.useOperator ) if not operator in self.operators : - raise RuntimeError, "operator not valid: %s" % operator + raise RuntimeError("operator not valid: %s" % operator) # depending on the operator we use intersection or union if operator=="or": @@ -223,6 +223,9 @@ if set is not None: if isinstance(set, int): set = IISet((set,)) + else: + # set can't be bigger than resultset + set = intersection(set, resultset) r = set_func(r, set) if isinstance(r, int): Modified: Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py =================================================================== --- Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py 2010-07-25 18:09:26 UTC (rev 115091) +++ Zope/trunk/src/Products/PluginIndexes/DateRangeIndex/DateRangeIndex.py 2010-07-25 22:03:14 UTC (rev 115092) @@ -28,7 +28,6 @@ from BTrees.IIBTree import IITreeSet from BTrees.IIBTree import intersection from BTrees.IIBTree import multiunion -from BTrees.IIBTree import union from BTrees.IOBTree import IOBTree from BTrees.Length import Length from DateTime.DateTime import DateTime @@ -242,7 +241,7 @@ return tuple( result ) - def _apply_index(self, request): + def _apply_index(self, request, resultset=None): """ Apply the index to query parameters given in 'request', which should be a mapping object. @@ -265,34 +264,17 @@ # Aggregate sets for each bucket separately, to avoid # large-small union penalties. # - #until_only = IISet() - #map( until_only.update, self._until_only.values( term ) ) - # XXX use multi-union until_only = multiunion( self._until_only.values( term ) ) - - #since_only = IISet() - #map( since_only.update, self._since_only.values( None, term ) ) - # XXX use multi-union since_only = multiunion( self._since_only.values( None, term ) ) - - #until = IISet() - #map( until.update, self._until.values( term ) ) - # XXX use multi-union until = multiunion( self._until.values( term ) ) - #since = IISet() - #map( since.update, self._since.values( None, term ) ) - # XXX use multi-union - since = multiunion( self._since.values( None, term ) ) + # Total result is bound by resultset + until = intersection(resultset, until) + since = multiunion(self._since.values(None, term)) + bounded = intersection(until, since) - bounded = intersection( until, since ) - # Merge from smallest to largest. - #result = union( self._always, until_only ) - result = union( bounded, until_only ) - result = union( result, since_only ) - #result = union( result, bounded ) - result = union( result, self._always ) + result = multiunion([bounded, until_only, since_only, self._always]) return result, ( self._since_field, self._until_field ) @@ -314,8 +296,8 @@ if set is None: self._until_only[ until ] = documentId else: - if isinstance(set, int): - set = self._until_only[ until ] = IISet((set, documentId)) + if isinstance(set, (int, IISet)): + set = self._until_only[until] = IITreeSet((set, documentId)) else: set.insert( documentId ) elif until is None: @@ -324,8 +306,8 @@ if set is None: self._since_only[ since ] = documentId else: - if isinstance(set, int): - set = self._since_only[ since ] = IISet((set, documentId)) + if isinstance(set, (int, IISet)): + set = self._since_only[since] = IITreeSet((set, documentId)) else: set.insert( documentId ) @@ -335,8 +317,8 @@ if set is None: self._since[ since ] = documentId else: - if isinstance(set, int): - set = self._since[ since ] = IISet((set, documentId)) + if isinstance(set, (int, IISet)): + set = self._since[since] = IITreeSet((set, documentId)) else: set.insert( documentId ) @@ -344,8 +326,8 @@ if set is None: self._until[ until ] = documentId else: - if isinstance(set, int): - set = self._until[ until ] = IISet((set, documentId)) + if isinstance(set, (int, IISet)): + set = self._until[until] = IITreeSet((set, documentId)) else: set.insert( documentId ) Modified: Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py =================================================================== --- Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py 2010-07-25 18:09:26 UTC (rev 115091) +++ Zope/trunk/src/Products/PluginIndexes/common/UnIndex.py 2010-07-25 22:03:14 UTC (rev 115092) @@ -21,7 +21,7 @@ from BTrees.IIBTree import intersection from BTrees.IIBTree import IITreeSet from BTrees.IIBTree import IISet -from BTrees.IIBTree import union +from BTrees.IIBTree import multiunion from BTrees.IOBTree import IOBTree from BTrees.Length import Length from BTrees.OOBTree import OOBTree @@ -31,7 +31,7 @@ from Products.PluginIndexes.common import safe_callable from Products.PluginIndexes.common.util import parseIndexRequest -from Products.PluginIndexes.interfaces import IPluggableIndex +from Products.PluginIndexes.interfaces import ILimitedResultIndex from Products.PluginIndexes.interfaces import ISortIndex from Products.PluginIndexes.interfaces import IUniqueValueIndex @@ -43,7 +43,7 @@ """Simple forward and reverse index. """ - implements(IPluggableIndex, IUniqueValueIndex, ISortIndex) + implements(ILimitedResultIndex, IUniqueValueIndex, ISortIndex) def __init__( self, id, ignore_ex=None, call_methods=None, extra=None, caller=None): @@ -302,7 +302,7 @@ LOG.debug('Attempt to unindex nonexistent document' ' with id %s' % documentId,exc_info=True) - def _apply_index(self, request): + def _apply_index(self, request, resultset=None): """Apply the index to query parameters given in the request arg. The request argument should be a mapping object. @@ -348,12 +348,8 @@ # experimental code for specifing the operator operator = record.get('operator',self.useOperator) if not operator in self.operators : - raise RuntimeError,"operator not valid: %s" % escape(operator) + raise RuntimeError("operator not valid: %s" % escape(operator)) - # depending on the operator we use intersection or union - if operator=="or": set_func = union - else: set_func = intersection - # Range parameter range_parm = record.get('range',None) if range_parm: @@ -375,24 +371,84 @@ if 'max' in opr_args: hi = max(record.keys) else: hi = None if hi: - setlist = index.items(lo,hi) + setlist = index.values(lo,hi) else: - setlist = index.items(lo) + setlist = index.values(lo) - for k, set in setlist: - if isinstance(set, int): - set = IISet((set,)) - r = set_func(r, set) + # If we only use one key, intersect and return immediately + if len(setlist) == 1: + result = setlist[0] + if isinstance(result, int): + result = IISet((result,)) + return result, (self.id,) + + if operator == 'or': + tmp = [] + for s in setlist: + if isinstance(s, int): + s = IISet((s,)) + tmp.append(s) + r = multiunion(tmp) + else: + # For intersection, sort with smallest data set first + tmp = [] + for s in setlist: + if isinstance(s, int): + s = IISet((s,)) + tmp.append(s) + if len(tmp) > 2: + setlist = sorted(tmp, key=len) + else: + setlist = tmp + r = resultset + for s in setlist: + # the result is bound by the resultset + r = intersection(r, s) + else: # not a range search - for key in record.keys: - set=index.get(key, None) - if set is None: - set = IISet(()) - elif isinstance(set, int): - set = IISet((set,)) - r = set_func(r, set) + # Filter duplicates + setlist = [] + for k in record.keys: + s = index.get(k, None) + # If None, try to bail early + if s is None: + if operator == 'or': + # If union, we can't possibly get a bigger result + continue + # If intersection, we can't possibly get a smaller result + return IISet(), (self.id,) + elif isinstance(s, int): + s = IISet((s,)) + setlist.append(s) - if isinstance(r, int): r=IISet((r,)) + # If we only use one key return immediately + if len(setlist) == 1: + result = setlist[0] + if isinstance(result, int): + result = IISet((result,)) + return result, (self.id,) + + if operator == 'or': + # If we already get a small result set passed in, intersecting + # the various indexes with it and doing the union later is + # faster than creating a multiunion first. + if resultset is not None and len(resultset) < 200: + smalllist = [] + for s in setlist: + smalllist.append(intersection(resultset, s)) + r = multiunion(smalllist) + else: + r = multiunion(setlist) + else: + # For intersection, sort with smallest data set first + if len(setlist) > 2: + setlist = sorted(setlist, key=len) + r = resultset + for s in setlist: + r = intersection(r, s) + + if isinstance(r, int): + r = IISet((r, )) if r is None: return IISet(), (self.id,) else: Modified: Zope/trunk/src/Products/PluginIndexes/interfaces.py =================================================================== --- Zope/trunk/src/Products/PluginIndexes/interfaces.py 2010-07-25 18:09:26 UTC (rev 115091) +++ Zope/trunk/src/Products/PluginIndexes/interfaces.py 2010-07-25 22:03:14 UTC (rev 115092) @@ -85,6 +85,15 @@ """Empty the index""" +class ILimitedResultIndex(IPluggableIndex): + + def _apply_index(request, resultset=None): + """Same as IPluggableIndex' _apply_index method. The additional + resultset argument contains the resultset, as already calculated by + ZCatalog's search method. + """ + + class IUniqueValueIndex(IPluggableIndex): """An index which can return lists of unique values contained in it""" Modified: Zope/trunk/src/Products/ZCatalog/Catalog.py =================================================================== --- Zope/trunk/src/Products/ZCatalog/Catalog.py 2010-07-25 18:09:26 UTC (rev 115091) +++ Zope/trunk/src/Products/ZCatalog/Catalog.py 2010-07-25 22:03:14 UTC (rev 115092) @@ -22,6 +22,7 @@ import ExtensionClass from Missing import MV from Persistence import Persistent +from Products.PluginIndexes.interfaces import ILimitedResultIndex import BTrees.Length from BTrees.IIBTree import intersection, weightedIntersection, IISet @@ -475,6 +476,10 @@ query[iid] = value return query + def _sorted_search_indexes(self, query): + # Simple implementation doing no ordering. + return self.indexes.keys() + def search(self, query, sort_index=None, reverse=0, limit=None, merge=1): """Iterate through the indexes, applying the query to each one. If merge is true then return a lazy result set (sorted if appropriate) @@ -497,25 +502,44 @@ # Canonicalize the request into a sensible query before passing it on query = self.make_query(query) + query_keys = query.keys() cr = self.getCatalogReport(query) cr.start() - for i in self.indexes.keys(): + for i in self._sorted_search_indexes(query): + if i not in query_keys: + # Do not ask indexes to restrict the result, which aren't + # part of the query + continue + index = self.getIndex(i) _apply_index = getattr(index, "_apply_index", None) if _apply_index is None: continue + limit_result = False + if ILimitedResultIndex.providedBy(index): + limit_result = True + cr.split(i) - r = _apply_index(query) + if limit_result: + r = _apply_index(query, rs) + else: + r = _apply_index(query) cr.split(i) if r is not None: r, u = r + # Short circuit if empty result + # BBB: We can remove the "r is not None" check in Zope 2.14 + # once we don't need to support the "return everything" case + # anymore + if r is not None and not r: + return LazyCat([]) w, rs = weightedIntersection(rs, r) if not rs: - break + break cr.stop() _______________________________________________ Zope-Checkins maillist - Zope-Checkins@zope.org https://mail.zope.org/mailman/listinfo/zope-checkins