------------------------------------------------------------
revno: 420
committer: Michal Hruby <michal....@gmail.com>
branch nick: zeitgeist
timestamp: Mon 2012-03-12 15:22:16 +0100
message:
  Index only recognized uri schemes
modified:
  extensions/fts++/indexer.cpp
  extensions/fts++/indexer.h
--
lp:zeitgeist
https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird

Your team Zeitgeist Framework Team is subscribed to branch lp:zeitgeist.
To unsubscribe from this branch go to https://code.launchpad.net/~zeitgeist/zeitgeist/bluebird/+edit-subscription
=== modified file 'extensions/fts++/indexer.cpp' --- extensions/fts++/indexer.cpp 2012-03-07 16:08:26 +0000 +++ extensions/fts++/indexer.cpp 2012-03-12 14:22:16 +0000 @@ -106,9 +106,17 @@ g_assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == HASH_LENGTH); this->checksum = g_checksum_new (G_CHECKSUM_MD5); - if (!this->checksum) - g_critical ("GChecksum initialization failed."); - + if (!this->checksum) g_critical ("GChecksum initialization failed."); + + GError *error = NULL; + /* we need to be careful with what we log, for example ubuntuone logs its + * weird uids and that screws up the index */ + this->uri_schemes_regex = g_regex_new ( + "(file|http[s]?|[s]?ftp|ssh|smb|dav[s]?|application)$", G_REGEX_OPTIMIZE, + (GRegexMatchFlags) 0, &error); + + if (error) + g_critical ("Unable to initialize uri scheme regex: %s", error->message); } catch (const Xapian::Error &xp_error) { @@ -399,7 +407,7 @@ tokenizer->index_text (StringUtils::AsciiFold (text), 5); } -void Indexer::IndexUri (std::string const& uri, std::string const& origin) +bool Indexer::IndexUri (std::string const& uri, std::string const& origin) { GFile *f = g_file_new_for_uri (uri.c_str ()); @@ -407,12 +415,21 @@ if (scheme == NULL) { g_warning ("Invalid URI: %s", uri.c_str ()); - return; + g_object_unref (f); + return false; } std::string scheme_str(scheme); g_free (scheme); + // do we support this scheme? 
+ if (!g_regex_match (uri_schemes_regex, scheme_str.c_str (), + (GRegexMatchFlags) 0, NULL)) + { + g_object_unref (f); + return false; + } + if (scheme_str == "file") { // FIXME: special case some typical filenames (like photos) @@ -462,7 +479,7 @@ weight_index < G_N_ELEMENTS (path_weights)) { // if this is already home directory we don't want it - if (path_component == home_dir_path) return; + if (path_component == home_dir_path) break; gchar *name = g_path_get_basename (path_component.c_str ()); @@ -481,10 +498,11 @@ // mailto:usern...@server.com size_t scheme_len = scheme_str.length () + 1; size_t at_pos = uri.find ('@', scheme_len); - if (at_pos == std::string::npos) return; - - tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5); - tokenizer->index_text (uri.substr (at_pos + 1), 1); + if (at_pos != std::string::npos) + { + tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5); + tokenizer->index_text (uri.substr (at_pos + 1), 1); + } } else if (scheme_str.compare (0, 4, "http") == 0) { @@ -578,6 +596,8 @@ } g_object_unref (f); + + return true; } bool Indexer::IndexActor (std::string const& actor, bool is_subject) @@ -1035,15 +1055,11 @@ if (!IndexActor (uri, true)) IndexUri (uri, origin); } - else if (uri.compare (0, 10, "ubuntuone:") == 0) + else if (!IndexUri (uri, origin)) { - // U1 logs its uids, we don't want to index those + // unsupported uri scheme return; } - else - { - IndexUri (uri, origin); - } } AddDocFilters (event, doc); === modified file 'extensions/fts++/indexer.h' --- extensions/fts++/indexer.h 2012-03-07 16:08:26 +0000 +++ extensions/fts++/indexer.h 2012-03-12 14:22:16 +0000 @@ -57,6 +57,7 @@ if (query_parser) delete query_parser; if (db) delete db; if (checksum) g_checksum_free (checksum); + if (uri_schemes_regex) g_regex_unref (uri_schemes_regex); for (AppInfoMap::iterator it = app_info_cache.begin (); it != app_info_cache.end (); ++it) @@ -111,7 +112,7 @@ void AddDocFilters (ZeitgeistEvent *event, 
Xapian::Document &doc); void IndexText (std::string const& text); - void IndexUri (std::string const& uri, std::string const& origin); + bool IndexUri (std::string const& uri, std::string const& origin); bool IndexActor (std::string const& actor, bool is_subject); gboolean ClearFailedLookupsCb (); @@ -123,7 +124,8 @@ Xapian::TermGenerator *tokenizer; AppInfoMap app_info_cache; ApplicationSet failed_lookups; - GChecksum *checksum; + GChecksum *checksum; + GRegex *uri_schemes_regex; guint clear_failed_id; std::string home_dir_path;
_______________________________________________
Mailing list: https://launchpad.net/~zeitgeist
Post to     : zeitgeist@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zeitgeist
More help   : https://help.launchpad.net/ListHelp