Seif Lotfy has proposed merging lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist.
Requested reviews: Zeitgeist Framework Team (zeitgeist) I took the time to rewrite the find_related_uris to use a more or less graph structuring before pushing everything into one pot and going through a 1-step aproiri. The results are the same except for one test case where [i1, i3, i5] is expected but [i3, i1, i5] is sent. However this results is also right since i3 and i2 both have the same count = 2 but were sorted differently whereas i5 count = 1. The new code is documented and is much shorter allowing me to get rid of many custom helpers that were written for find_related_uris. If this branch is ok with you i will change the last test case and merge it. Cheers Seif -- https://code.launchpad.net/~seif/zeitgeist/rewrite-find-related-uris/+merge/38746 Your team Zeitgeist Framework Team is requested to review the proposed merge of lp:~seif/zeitgeist/rewrite-find-related-uris into lp:zeitgeist.
=== modified file '_zeitgeist/engine/main.py' --- _zeitgeist/engine/main.py 2010-10-12 18:37:10 +0000 +++ _zeitgeist/engine/main.py 2010-10-18 18:09:42 +0000 @@ -36,8 +36,6 @@ from _zeitgeist.engine import constants from _zeitgeist.engine.sql import get_default_cursor, unset_cursor, \ TableLookup, WhereClause - -WINDOW_SIZE = 7 logging.basicConfig(level=logging.DEBUG) log = logging.getLogger("zeitgeist.engine") @@ -325,8 +323,8 @@ Return modes: - 0: IDs. - 1: Events. - - 2: (Timestamp, SubjectUri) """ + print "*********", time_range t = time.time() where = self._build_sql_event_filter(time_range, event_templates, @@ -339,8 +337,6 @@ sql = "SELECT DISTINCT id FROM event_view" elif return_mode == 1: sql = "SELECT id FROM event_view" - elif return_mode == 2: - sql = "SELECT subj_uri, timestamp FROM event_view" else: raise NotImplementedError, "Unsupported return_mode." @@ -388,10 +384,7 @@ result = [row[0] for row in result] elif return_mode == 1: log.debug("Found %d events IDs in %fs" % (len(result), time.time()- t)) - result = self.get_events(ids=[row[0] for row in result], sender=sender) - elif return_mode == 2: - log.debug("Found %d (uri,timestamp) tuples in %fs" % (len(result), time.time()- t)) - result = map(lambda row: (row[0], row[1]), result) + result = self.get_events(ids=[row[0] for row in result], sender=sender) else: raise Exception("%d" % return_mode) @@ -403,12 +396,6 @@ def find_events(self, *args): return self._find_events(1, *args) - def __add_window(self, _set, assoc, landmarks, windows): - if _set & landmarks: # intersection != 0 - windows.append(_set) - for i in _set.difference(landmarks): - assoc[i] += 1 - def find_related_uris(self, timerange, event_templates, result_event_templates, result_storage_state, num_results, result_type): """ @@ -421,79 +408,58 @@ """ if result_type == 0 or result_type == 1: - + # Instead of using sliding windows we will be using a graph representation + t1 = time.time() - if len(result_event_templates) == 0: - uris = self._find_events(2, timerange, result_event_templates, - result_storage_state, 0, ResultType.LeastRecentEvents) - else: - uris = self._find_events(2, timerange, result_event_templates + event_templates, - result_storage_state, 0, ResultType.LeastRecentEvents) - - assoc = defaultdict(int) - - landmarks = self._find_events(2, timerange, event_templates, - result_storage_state, 0, 4) - landmarks = set([unicode(event[0]) for event in landmarks]) - - latest_uris = dict(uris) - events = [unicode(u[0]) for u in uris] - - furis = filter(lambda x: x[0] in landmarks, uris) - if len(furis) == 0: - return [] - - _min = min(furis, key=operator.itemgetter(1)) - _max = max(furis, key=operator.itemgetter(1)) - min_index = uris.index(_min) - WINDOW_SIZE - max_index = uris.index(_max) + WINDOW_SIZE - _min = _min[1] - _max = _max[1] - - if min_index < 0: - min_index = 0 - if max_index > len(events): - max_index = -1 - - func = self.__add_window - - if len(events) == 0 or len(landmarks) == 0: - return [] - - windows = [] - - if len(events) <= WINDOW_SIZE: - #TODO bug! windows is not defined, seems the algorithm never touches these loop - func(events, assoc, landmarks, windows) - else: - events = events[min_index:max_index] - offset = WINDOW_SIZE/2 - - for i in xrange(offset): - func(set(events[0: offset - i]), assoc, landmarks, - windows) - func(set(events[len(events) - offset + i: len(events)]), - assoc, landmarks, windows) - - it = iter(events) - result = tuple(islice(it, WINDOW_SIZE)) - for elem in it: - result = result[1:] + (elem,) - func(set(result), assoc, landmarks, windows) - - + # We pick out the ids for relational event so we can set them as roots + # the ids are taken from the events that match the events_templates + ids = self._find_events(0, timerange, event_templates, + result_storage_state, 0, ResultType.LeastRecentEvents) + + # Pick out the result_ids for the filtered results we would like to take into account + # the ids are taken from the events that match the result_event_templates + # if no result_event_templates are set we consider all results as allowed + result_ids = [] + if len(result_event_templates) > 0: + result_ids = self._find_events(0, timerange, result_event_templates, + result_storage_state, 0, ResultType.LeastRecentEvents) + + # From here we create several graphs with the maximum height depth of 2 + # and push all the nodes and vertices (events) in one pot together + # FIXME: the depth should be adaptable + pot = [] + for id in ids: + for x in xrange(id-2,id+3): + if len(result_ids) == 0 or x in result_ids: + pot.append(x) + + # Out of the pot we get all respected events and count which uris occur most + events = self.get_events(pot) + subject_uri_counter = {} + latest_uris = {} + for event in events: + if event and not event.id in ids: + subj = event.subjects[0] + if not subject_uri_counter.has_key(subj.uri): + subject_uri_counter[subj.uri] = 0 + subject_uri_counter[subj.uri] += 1 + latest_uris[subj.uri] = event + log.debug("FindRelatedUris: Finished sliding windows in %fs." % \ (time.time()-t1)) if result_type == 0: - sets = [[v, k] for k, v in assoc.iteritems()] + sets = [[v, k] for k, v in subject_uri_counter.iteritems()] elif result_type == 1: - sets = [[latest_uris[k], k] for k in assoc] + sets = [[latest_uris[k], k] for k in subject_uri_counter] sets.sort(reverse = True) + print sets sets = map(lambda result: result[1], sets[:num_results]) + print "####################" + print subject_uri_counter return sets else: raise NotImplementedError, "Unsupported ResultType."
_______________________________________________ Mailing list: https://launchpad.net/~zeitgeist Post to : zeitgeist@lists.launchpad.net Unsubscribe : https://launchpad.net/~zeitgeist More help : https://help.launchpad.net/ListHelp