Mgautierfr has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/344962 )
Change subject: Store the language used by the stemmer in the database.
......................................................................
Store the language used by the stemmer in the database.
To properly search in the database, a user needs to use the same stemming
algorithm/data as the one used while indexing the content.
By storing the language used in the database, user code can create the
same stemmer on its side and correctly parse a query.
Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f
---
M zimwriterfs/xapianIndexer.cpp
M zimwriterfs/xapianIndexer.h
2 files changed, 10 insertions(+), 6 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/62/344962/1
diff --git a/zimwriterfs/xapianIndexer.cpp b/zimwriterfs/xapianIndexer.cpp
index db27f9d..5abec26 100644
--- a/zimwriterfs/xapianIndexer.cpp
+++ b/zimwriterfs/xapianIndexer.cpp
@@ -20,21 +20,23 @@
#include "xapianIndexer.h"
/* Constructor */
-XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) {
+XapianIndexer::XapianIndexer(const std::string& language, const bool verbose) :
+ language(language)
+{
setVerboseFlag(verbose);
readStopWords(language);
/* Build ICU Local object to retrieve ISO-639 language code (from
ISO-639-3) */
- icu::Locale *languageLocale = new icu::Locale(language.c_str());
+ icu::Locale languageLocale(language.c_str());
/* Configuring language base steemming */
try {
- this->stemmer = Xapian::Stem(languageLocale->getLanguage());
- this->indexer.set_stemmer(this->stemmer);
- this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL);
+ this->stemmer = Xapian::Stem(languageLocale.getLanguage());
+ this->indexer.set_stemmer(this->stemmer);
+ this->indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL);
} catch (...) {
- std::cout << "No steemming for language '" <<
languageLocale->getLanguage() << "'" << std::endl;
+ std::cout << "No steemming for language '" <<
languageLocale.getLanguage() << "'" << std::endl;
}
}
@@ -53,6 +55,7 @@
indexPath = indexPath_;
this->writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp",
Xapian::DB_CREATE_OR_OVERWRITE);
this->writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1");
+ this->writableDatabase.set_metadata("language", language);
this->writableDatabase.begin_transaction(true);
/* Insert the stopwords */
diff --git a/zimwriterfs/xapianIndexer.h b/zimwriterfs/xapianIndexer.h
index 16dc094..8f85337 100644
--- a/zimwriterfs/xapianIndexer.h
+++ b/zimwriterfs/xapianIndexer.h
@@ -73,6 +73,7 @@
Xapian::SimpleStopper stopper;
Xapian::TermGenerator indexer;
std::string indexPath;
+ std::string language;
};
#endif // OPENZIM_ZIMWRITERFS_XAPIANINDEXER_H
--
To view, visit https://gerrit.wikimedia.org/r/344962
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Idb7049f3639d4e96f50ca1af6bc491096ec2d52f
Gerrit-PatchSet: 1
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: Mgautierfr <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits