[MediaWiki-commits] [Gerrit] openzim[master]: Store the language used by the stemmer in the database.
Kelson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/344962 ) Change subject: Store the language used by the stemmer in the database. .. Store the language used by the stemmer in the database. To properly search in the database, a user needs to use the same stemming algorithm/data as the one used while indexing the content. By storing the used language in the database, user code can create the same stemmer on its side and correctly parse a query. Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f --- M zimwriterfs/xapianIndexer.cpp M zimwriterfs/xapianIndexer.h 2 files changed, 10 insertions(+), 6 deletions(-) Approvals: Kelson: Verified; Looks good to me, approved diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp index db27f9d..5abec26 100644 --- a/zimwriterfs/xapianIndexer.cpp +++ b/zimwriterfs/xapianIndexer.cpp @@ -20,21 +20,23 @@ #include "xapianIndexer.h" /* Constructor */ -XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) { +XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) : +language(language) +{ setVerboseFlag(verbose); readStopWords(language); /* Build ICU Local object to retrieve ISO-639 language code (from ISO-639-3) */ -icu::Locale *languageLocale = new icu::Locale(language.c_str()); +icu::Locale languageLocale(language.c_str()); /* Configuring language base steemming */ try { - this->stemmer = Xapian::Stem(languageLocale->getLanguage()); - this->indexer.set_stemmer(this->stemmer); - this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); +this->stemmer = Xapian::Stem(languageLocale.getLanguage()); +this->indexer.set_stemmer(this->stemmer); +this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); } catch (...) 
{ - std::cout << "No steemming for language '" << languageLocale->getLanguage() << "'" << std::endl; +std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } } @@ -53,6 +55,7 @@ indexPath = indexPath_; this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1"); +this->writableDatabase.set_metadata("language", language); this->writableDatabase.begin_transaction(true); /* Insert the stopwords */ diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h index 16dc094..8f85337 100644 --- a/zimwriterfs/xapianIndexer.h +++ b/zimwriterfs/xapianIndexer.h @@ -73,6 +73,7 @@ Xapian::SimpleStopper stopper; Xapian::TermGenerator indexer; std::string indexPath; +std::string language; }; #endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H -- To view, visit https://gerrit.wikimedia.org/r/344962 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr Gerrit-Reviewer: Kelson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] openzim[master]: Store the language used by the stemmer in the database.
Mgautierfr has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/344962 ) Change subject: Store the language used by the stemmer in the database. .. Store the language used by the stemmer in the database. To properly search in the database, a user needs to use the same stemming algorithm/data as the one used while indexing the content. By storing the used language in the database, user code can create the same stemmer on its side and correctly parse a query. Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f --- M zimwriterfs/xapianIndexer.cpp M zimwriterfs/xapianIndexer.h 2 files changed, 10 insertions(+), 6 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/62/344962/1 diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp index db27f9d..5abec26 100644 --- a/zimwriterfs/xapianIndexer.cpp +++ b/zimwriterfs/xapianIndexer.cpp @@ -20,21 +20,23 @@ #include "xapianIndexer.h" /* Constructor */ -XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) { +XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) : +language(language) +{ setVerboseFlag(verbose); readStopWords(language); /* Build ICU Local object to retrieve ISO-639 language code (from ISO-639-3) */ -icu::Locale *languageLocale = new icu::Locale(language.c_str()); +icu::Locale languageLocale(language.c_str()); /* Configuring language base steemming */ try { - this->stemmer = Xapian::Stem(languageLocale->getLanguage()); - this->indexer.set_stemmer(this->stemmer); - this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); +this->stemmer = Xapian::Stem(languageLocale.getLanguage()); +this->indexer.set_stemmer(this->stemmer); +this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); } catch (...) 
{ - std::cout << "No steemming for language '" << languageLocale->getLanguage() << "'" << std::endl; +std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } } @@ -53,6 +55,7 @@ indexPath = indexPath_; this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1"); +this->writableDatabase.set_metadata("language", language); this->writableDatabase.begin_transaction(true); /* Insert the stopwords */ diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h index 16dc094..8f85337 100644 --- a/zimwriterfs/xapianIndexer.h +++ b/zimwriterfs/xapianIndexer.h @@ -73,6 +73,7 @@ Xapian::SimpleStopper stopper; Xapian::TermGenerator indexer; std::string indexPath; +std::string language; }; #endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H -- To view, visit https://gerrit.wikimedia.org/r/344962 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits