Kelson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/344962 )

Change subject: Store the language used by the stemmer in the database.
......................................................................


Store the language used by the stemmer in the database.

To properly search in the database, a user needs to use the same stemming
algorithm/data as the one used while indexing the content.

By storing the language used in the database, user code can create the
same stemmer on its side and correctly parse a query.

Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f
---
M zimwriterfs/xapianIndexer.cpp
M zimwriterfs/xapianIndexer.h
2 files changed, 10 insertions(+), 6 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp
index db27f9d..5abec26 100644
--- a/zimwriterfs/xapianIndexer.cpp
+++ b/zimwriterfs/xapianIndexer.cpp
@@ -20,21 +20,23 @@
 #include "xapianIndexer.h"
 
 /* Constructor */
-XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) {
+XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) :
+    language(language)
+{
     setVerboseFlag(verbose);
     readStopWords(language);
 
     /* Build ICU Local object to retrieve ISO-639 language code (from
        ISO-639-3) */
-    icu::Locale *languageLocale = new icu::Locale(language.c_str());
+    icu::Locale languageLocale(language.c_str());
 
     /* Configuring language base steemming */
     try {
-      this->stemmer = Xapian::Stem(languageLocale->getLanguage());
-      this->indexer.set_stemmer(this->stemmer);
-      this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL);
+        this->stemmer = Xapian::Stem(languageLocale.getLanguage());
+        this->indexer.set_stemmer(this->stemmer);
+        this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL);
     } catch (...) {
-      std::cout << "No steemming for language '" << 
languageLocale->getLanguage() << "'" << std::endl;
+        std::cout << "No steemming for language '" << 
languageLocale.getLanguage() << "'" << std::endl;
     }
 }
 
@@ -53,6 +55,7 @@
     indexPath = indexPath_;
     this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", 
Xapian::DB_CREATE_OR_OVERWRITE);
     this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1");
+    this->writableDatabase.set_metadata("language", language);
     this->writableDatabase.begin_transaction(true);
 
     /* Insert the stopwords */
diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h
index 16dc094..8f85337 100644
--- a/zimwriterfs/xapianIndexer.h
+++ b/zimwriterfs/xapianIndexer.h
@@ -73,6 +73,7 @@
         Xapian::SimpleStopper stopper;
         Xapian::TermGenerator indexer;
         std::string indexPath;
+        std::string language;
 };
 
 #endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H

-- 
To view, visit https://gerrit.wikimedia.org/r/344962
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f
Gerrit-PatchSet: 1
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr>
Gerrit-Reviewer: Kelson <kel...@kiwix.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to