Kelson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/344963 )
Change subject: Store the stop words used by the indexer in the database. ...................................................................... Store the stop words used by the indexer in the database. To properly search in the database, a user needs to use the same stop words as the ones used while indexing the content. By storing the used stop words in the database, user code can use them on its side and correctly parse a query. It is not enough to store the language, as we need to read a file/resources and user code may not have them. Change-Id: I6cbc9f8d30c39d4fc1e65a356347d8fbfd456494 --- M zimwriterfs/indexer.cpp M zimwriterfs/indexer.h M zimwriterfs/xapianIndexer.cpp M zimwriterfs/xapianIndexer.h 4 files changed, 14 insertions(+), 28 deletions(-) Approvals: Kelson: Verified; Looks good to me, approved diff --git a/zimwriterfs/indexer.cpp b/zimwriterfs/indexer.cpp index 33989f4..8e0c211 100644 --- a/zimwriterfs/indexer.cpp +++ b/zimwriterfs/indexer.cpp @@ -57,18 +57,6 @@ Indexer::~Indexer() { } - /* Read the stopwords */ - void Indexer::readStopWords(const string languageCode) { - std::string stopWord; - std::istringstream file(getResourceAsString("stopwords/" + languageCode)); - - this->stopWords.clear(); - - while (getline(file, stopWord, '\n')) { - this->stopWords.push_back(stopWord); - } - } - /* Article indexer methods */ void *Indexer::indexArticles(void *ptr) { pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); diff --git a/zimwriterfs/indexer.h b/zimwriterfs/indexer.h index 686d156..797db7a 100644 --- a/zimwriterfs/indexer.h +++ b/zimwriterfs/indexer.h @@ -68,10 +68,6 @@ virtual void flush() = 0; virtual void indexingPostlude() = 0; - /* Stop words */ - std::vector<std::string> stopWords; - void readStopWords(const string languageCode); - /* Others */ unsigned int countWords(const string &text); diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp index 5abec26..c4c0b2e 100644 --- a/zimwriterfs/xapianIndexer.cpp +++ 
b/zimwriterfs/xapianIndexer.cpp @@ -18,13 +18,13 @@ */ #include "xapianIndexer.h" +#include "resourceTools.h" /* Constructor */ XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) : language(language) { setVerboseFlag(verbose); - readStopWords(language); /* Build ICU Local object to retrieve ISO-639 language code (from ISO-639-3) */ @@ -38,6 +38,17 @@ } catch (...) { std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } + + /* Read the stopwords */ + std::string stopWord; + this->stopwords = getResourceAsString("stopwords/"+language); + std::istringstream file(this->stopwords); + while (std::getline(file, stopWord, '\n')) { + this->stopper.add(stopWord); + } + + this->indexer.set_stopper(&(this->stopper)); + this->indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); } XapianIndexer::~XapianIndexer(){ @@ -56,18 +67,8 @@ this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1"); this->writableDatabase.set_metadata("language", language); + this->writableDatabase.set_metadata("stopwords", stopwords); this->writableDatabase.begin_transaction(true); - - /* Insert the stopwords */ - if (!this->stopWords.empty()) { - std::vector<std::string>::iterator it = this->stopWords.begin(); - for( ; it != this->stopWords.end(); ++it) { - this->stopper.add(*it); - } - - this->indexer.set_stopper(&(this->stopper)); - this->indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); - } } void XapianIndexer::index(const string &url, diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h index 8f85337..510692d 100644 --- a/zimwriterfs/xapianIndexer.h +++ b/zimwriterfs/xapianIndexer.h @@ -74,6 +74,7 @@ Xapian::TermGenerator indexer; std::string indexPath; std::string language; + std::string stopwords; }; #endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H -- To view, visit 
https://gerrit.wikimedia.org/r/344963 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I6cbc9f8d30c39d4fc1e65a356347d8fbfd456494 Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr> Gerrit-Reviewer: Kelson <kel...@kiwix.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits