Merge authors:
  Michal Hruby (mhr3)
Related merge proposals:
  https://code.launchpad.net/~zeitgeist/zeitgeist/fts-origin-hashing/+merge/98281
  proposed by: Siegfried Gevatter (rainct)
  review: Approve - Siegfried Gevatter (rainct)
------------------------------------------------------------
revno: 440 [merge]
committer: Michal Hruby <michal....@gmail.com>
branch nick: zeitgeist
timestamp: Mon 2012-03-19 22:44:48 +0100
message:
  Merge lp:~zeitgeist/zeitgeist/fts-origin-hashing
modified:
  extensions/fts++/indexer.cpp
  extensions/fts++/indexer.h
--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird

Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp' --- extensions/fts++/indexer.cpp 2012-03-19 19:56:38 +0000 +++ extensions/fts++/indexer.cpp 2012-03-19 21:33:59 +0000 @@ -43,6 +43,7 @@ const Xapian::valueno VALUE_EVENT_ID = 0; const Xapian::valueno VALUE_TIMESTAMP = 1; const Xapian::valueno VALUE_URI_HASH = 2; +const Xapian::valueno VALUE_ORIGIN_HASH = 3; #define QUERY_PARSER_FLAGS \ Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \ @@ -763,7 +764,11 @@ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS || result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS || result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS || - result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS) + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN) { maxhits = count; } @@ -795,8 +800,7 @@ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN || result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN) { - // FIXME: not really correct but close :) - enquire->set_collapse_key (VALUE_URI_HASH); + enquire->set_collapse_key (VALUE_ORIGIN_HASH); } else if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS || result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS) @@ -1137,10 +1141,8 @@ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_ORIGIN || result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_ORIGIN) { - // FIXME: not really correct but close :) enquire->set_sort_by_relevance_then_value (VALUE_TIMESTAMP, reversed_sort); - enquire->set_collapse_key (VALUE_URI_HASH); - maxhits *= 3; + enquire->set_collapse_key (VALUE_ORIGIN_HASH); } else { @@ -1272,6 +1274,16 @@ return results; } +static void +get_digest_for_uri (GChecksum *checksum, const gchar *uri, + guint8 *digest, gsize 
*digest_size) +{ + g_checksum_update (checksum, (guchar *) uri, -1); + g_checksum_get_digest (checksum, digest, digest_size); + g_checksum_reset (checksum); + g_assert (digest_size == NULL || *digest_size == HASH_LENGTH); +} + void Indexer::IndexEvent (ZeitgeistEvent *event) { try @@ -1322,19 +1334,28 @@ return; // ignore this event completely... } + guint8 uri_hash[HASH_LENGTH + 1]; + gsize hash_size = HASH_LENGTH; + // We need the subject URI so we can use Xapian's collapse key feature // for *_SUBJECT grouping. However, to save space, we'll just save a hash. // A better option would be using URI's id, but for that we'd need a SQL // query that'd be subject to races. // FIXME(?): This doesn't work for events with multiple subjects. - g_checksum_update (checksum, (guchar *) uri.c_str (), -1); - guint8 uri_hash[HASH_LENGTH + 1]; - gsize hash_size = HASH_LENGTH; - g_checksum_get_digest (checksum, uri_hash, &hash_size); - g_checksum_reset (checksum); - g_assert (hash_size == HASH_LENGTH); + get_digest_for_uri (checksum, uri.c_str (), uri_hash, &hash_size); doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, hash_size)); + size_t colon_pos = uri.find (':'); + // FIXME: current_origin once we have that + val = zeitgeist_subject_get_origin (subject); + // make sure the schemas of the URI and origin are the same + if (val && colon_pos != std::string::npos && strncmp (uri.c_str (), val, colon_pos+1) == 0) + { + hash_size = HASH_LENGTH; + get_digest_for_uri (checksum, val, uri_hash, &hash_size); + doc.add_value (VALUE_ORIGIN_HASH, std::string((char *) uri_hash, hash_size)); + } + val = zeitgeist_subject_get_text (subject); if (val && val[0] != '\0') { === modified file 'extensions/fts++/indexer.h' --- extensions/fts++/indexer.h 2012-03-12 14:22:16 +0000 +++ extensions/fts++/indexer.h 2012-03-19 21:42:52 +0000 @@ -29,7 +29,7 @@ namespace ZeitgeistFTS { -const std::string INDEX_VERSION = "2"; +const std::string INDEX_VERSION = "3"; class Indexer {
_______________________________________________
Mailing list: https://launchpad.net/~zeitgeist
Post to     : zeitgeist@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zeitgeist
More help   : https://help.launchpad.net/ListHelp