Kelson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/344963 )

Change subject: Store the stop words used by the indexer in the database.
......................................................................


Store the stop words used by the indexer in the database.

To properly search in the database, a user needs to use the same stop words
as the ones used while indexing the content.

By storing the stop words used in the database, user code can use them
on its side and correctly parse a query.
It is not enough to store the language, as the stop words are read from a
file/resource that user code may not have.

Change-Id: I6cbc9f8d30c39d4fc1e65a356347d8fbfd456494
---
M zimwriterfs/indexer.cpp
M zimwriterfs/indexer.h
M zimwriterfs/xapianIndexer.cpp
M zimwriterfs/xapianIndexer.h
4 files changed, 14 insertions(+), 28 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimwriterfs/indexer.cpp b/zimwriterfs/indexer.cpp
index 33989f4..8e0c211 100644
--- a/zimwriterfs/indexer.cpp
+++ b/zimwriterfs/indexer.cpp
@@ -57,18 +57,6 @@
   Indexer::~Indexer() {
   }
 
-  /* Read the stopwords */
-  void Indexer::readStopWords(const string languageCode) {
-    std::string stopWord;
-    std::istringstream file(getResourceAsString("stopwords/" + languageCode));
-
-    this->stopWords.clear();
-
-    while (getline(file, stopWord, '\n')) {
-      this->stopWords.push_back(stopWord);
-    }
-  }
-
   /* Article indexer methods */
   void *Indexer::indexArticles(void *ptr) {
     pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
diff --git a/zimwriterfs/indexer.h b/zimwriterfs/indexer.h
index 686d156..797db7a 100644
--- a/zimwriterfs/indexer.h
+++ b/zimwriterfs/indexer.h
@@ -68,10 +68,6 @@
     virtual void flush() = 0;
     virtual void indexingPostlude() = 0;
 
-    /* Stop words */
-    std::vector<std::string> stopWords;
-    void readStopWords(const string languageCode);
-
     /* Others */
     unsigned int countWords(const string &text);
 
diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp
index 5abec26..c4c0b2e 100644
--- a/zimwriterfs/xapianIndexer.cpp
+++ b/zimwriterfs/xapianIndexer.cpp
@@ -18,13 +18,13 @@
  */
 
 #include "xapianIndexer.h"
+#include "resourceTools.h"
 
 /* Constructor */
 XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) :
     language(language)
 {
     setVerboseFlag(verbose);
-    readStopWords(language);
 
     /* Build ICU Local object to retrieve ISO-639 language code (from
        ISO-639-3) */
@@ -38,6 +38,17 @@
     } catch (...) {
         std::cout << "No steemming for language '" << 
languageLocale.getLanguage() << "'" << std::endl;
     }
+
+     /* Read the stopwords */
+    std::string stopWord;
+    this->stopwords = getResourceAsString("stopwords/"+language);
+    std::istringstream file(this->stopwords);
+    while (std::getline(file, stopWord, '\n')) {
+        this->stopper.add(stopWord);
+    }
+
+    this->indexer.set_stopper(&(this->stopper));
+    this->indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL);
 }
 
 XapianIndexer::~XapianIndexer(){
@@ -56,18 +67,8 @@
     this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", 
Xapian::DB_CREATE_OR_OVERWRITE);
     this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1");
     this->writableDatabase.set_metadata("language", language);
+    this->writableDatabase.set_metadata("stopwords", stopwords);
     this->writableDatabase.begin_transaction(true);
-
-    /* Insert the stopwords */
-    if (!this->stopWords.empty()) {
-      std::vector<std::string>::iterator it = this->stopWords.begin();
-      for( ; it != this->stopWords.end(); ++it) {
-       this->stopper.add(*it);
-      }
-
-      this->indexer.set_stopper(&(this->stopper));
-      this->indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL);
-    }
 }
 
 void XapianIndexer::index(const string &url,
diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h
index 8f85337..510692d 100644
--- a/zimwriterfs/xapianIndexer.h
+++ b/zimwriterfs/xapianIndexer.h
@@ -74,6 +74,7 @@
         Xapian::TermGenerator indexer;
         std::string indexPath;
         std::string language;
+        std::string stopwords;
 };
 
 #endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H

-- 
To view, visit https://gerrit.wikimedia.org/r/344963
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6cbc9f8d30c39d4fc1e65a356347d8fbfd456494
Gerrit-PatchSet: 1
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr>
Gerrit-Reviewer: Kelson <kel...@kiwix.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to