[MediaWiki-commits] [Gerrit] Add an indexer. - change (openzim)
Kelson has submitted this change and it was merged. Change subject: Add an indexer. .. Add an indexer. This indexer is not used. This is mainly code from kiwix-indexer imported into openzim. Unused functions in *Tools have been removed. No dependency on xapian. Change-Id: I55079339d21d6903634c265f83f4d1c6ba0ac333 --- M zimwriterfs/Makefile.am A zimwriterfs/indexer.cpp A zimwriterfs/indexer.h A zimwriterfs/pathTools.cpp A zimwriterfs/pathTools.h A zimwriterfs/resourceTools.cpp A zimwriterfs/resourceTools.h M zimwriterfs/zimwriterfs.cpp 8 files changed, 921 insertions(+), 2 deletions(-) Approvals: Kelson: Verified; Looks good to me, approved diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am index 92641d9..628b74c 100644 --- a/zimwriterfs/Makefile.am +++ b/zimwriterfs/Makefile.am @@ -6,4 +6,7 @@ tools.cpp \ article.cpp \ articlesource.cpp \ +indexer.cpp \ +resourceTools.cpp \ +pathTools.cpp \ mimetypecounter.cpp diff --git a/zimwriterfs/indexer.cpp b/zimwriterfs/indexer.cpp new file mode 100644 index 000..7820a32 --- /dev/null +++ b/zimwriterfs/indexer.cpp @@ -0,0 +1,262 @@ +/* + * Copyright 2011-2014 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "indexer.h" +#include "resourceTools.h" +#include "pathTools.h" +#include + + /* Count word */ + unsigned int Indexer::countWords(const string &text) { +unsigned int numWords = 1; +unsigned int length = text.size(); + +for(unsigned int i=0; istopWords.clear(); + +while (getline(file, stopWord, '\n')) { + this->stopWords.push_back(stopWord); +} + } + + /* Article indexer methods */ + void *Indexer::indexArticles(void *ptr) { +pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); +Indexer *self = (Indexer *)ptr; +unsigned int indexedArticleCount = 0; +indexerToken token; + +self->indexingPrelude(self->getIndexPath()); + +while (self->popFromToIndexQueue(token)) { + self->index(token.url, + token.accentedTitle, + token.title, + token.keywords, + token.content, + token.snippet, + token.size, + token.wordCount + ); + + indexedArticleCount += 1; + + /* Make a hard-disk flush every 10.000 articles */ + if (indexedArticleCount % 5000 == 0) { + self->flush(); + } + + /* Test if the thread should be cancelled */ + pthread_testcancel(); +} +self->indexingPostlude(); + +/* Write content id file */ +string path = appendToDirectory(self->getIndexPath(), "content.id"); +writeTextFile(path, self->getZimId()); + +usleep(100); + +self->articleIndexerRunning(false); +pthread_exit(NULL); +return NULL; + } + + void Indexer::articleIndexerRunning(bool value) { +pthread_mutex_lock(&articleIndexerRunningMutex); +this->articleIndexerRunningFlag = value; +pthread_mutex_unlock(&articleIndexerRunningMutex); + } + + bool Indexer::isArticleIndexerRunning() { +pthread_mutex_lock(&articleIndexerRunningMutex); +bool retVal = this->articleIndexerRunningFlag; +pthread_mutex_unlock(&articleIndexerRunningMutex); +return retVal; + } + + /* ToIndexQueue methods */ + bool Indexer::isToIndexQueueEmpty() { +pthread_mutex_lock(&toIndexQueueMutex); +bool retVal = this->toIndexQueue.empty(); +pthread_mutex_unlock(&toIndexQueueMutex); +return retVal; + } + + void 
Indexer::pushToIndexQueue(indexerToken &token) { +pthread_mutex_lock(&toIndexQueueMutex); +this->toIndexQueue.push(token); +pthread_mutex_unlock(&toIndexQueueMutex); +usleep(int(this->toIndexQueue.size() / 200) / 10 * 1000); + } + + bool Indexer::popFromToIndexQueue(indexerToken &token) { +while (this->isToIndexQueueEmpty()) { + usleep(500); + if (this->getVerboseFlag()) { + std::cout << "Waiting... ToIndexQueue is empty for now..." << std::endl; + } + + pthread_testcancel(); +} + +pthread_mutex_lock(&toIndexQueueMutex); +token = this->toIndexQueue.front(); +this->toIndexQueue.pop(); +pthread_mutex_unlock(&toIndexQueueMutex); + +
[MediaWiki-commits] [Gerrit] Add an indexer. - change (openzim)
Mgautierfr has uploaded a new change for review. https://gerrit.wikimedia.org/r/295520 Change subject: Add an indexer. .. Add an indexer. This indexer is not used. This is mainly code from kiwix-indexer imported into openzim. Unused functions in *Tools have been removed. No dependency on xapian. Change-Id: I0683706a136fb2303234e4caee77e9221714a5b1 --- M zimwriterfs/Makefile.am M zimwriterfs/configure.ac A zimwriterfs/indexer.cpp A zimwriterfs/indexer.h A zimwriterfs/pathTools.cpp A zimwriterfs/pathTools.h A zimwriterfs/resourceTools.cpp A zimwriterfs/resourceTools.h M zimwriterfs/tools.cpp M zimwriterfs/tools.h 10 files changed, 1,009 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/20/295520/1 diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am index 6e46553..e54c64d 100644 --- a/zimwriterfs/Makefile.am +++ b/zimwriterfs/Makefile.am @@ -5,4 +5,10 @@ zimwriterfs.cpp \ tools.cpp \ article.cpp \ -articlesource.cpp +articlesource.cpp \ +indexer.cpp \ +resourceTools.cpp \ +pathTools.cpp + +zimwriterfs_CXXFLAGS = $(ICU_CFLAGS) +zimwriterfs_LDFLAGS = $(ICU_LDFLAGS) diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac index fb12c8f..ba23cd9 100644 --- a/zimwriterfs/configure.ac +++ b/zimwriterfs/configure.ac @@ -71,6 +71,81 @@ AC_DEFINE_UNQUOTED(LZMA_MEMORY_SIZE, 128, [set lzma uncompress memory size to number of MB]) AC_DEFINE(ENABLE_LZMA, [1], [defined if lzma compression is enabled]) + +function findLibrary { + found=0 + for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do + sf=`find $f -name $1 | grep $ARCH | head -1 2> /dev/null` + if [[ -f "$sf" -a $found -eq 0 ]] + then + found=1 + echo $sf + fi + done + if [[ $found -eq 0 ]] + then + for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do + sf=`find $f -name $1 | head -1 2> /dev/null` + if [[ -f "$sf" -a $found -eq 0 ]] + then + found=1 + echo $sf + fi + done + fi + if [[ $found -eq 0 ]] + then + echo "no" + fi +} + + + + ICU + + + +ICU_CFLAGS="" +ICU_LDFLAGS="-licui18n 
-licuuc -licudata" # replaced by icu-config +ICU_STATIC_LDFLAGS="" + +# if --with-x, add path to LIBRARY_PATH +AC_ARG_WITH(icu, +AC_HELP_STRING([--with-icu=DIR], [alternate location for icu-config]), +export LIBRARY_PATH="${withval}:${LIBRARY_PATH}";ICU_PATH=${withval} + ) + +# look for shared library. +# AC_CHECK_HEADER([zlib.h],, [AC_MSG_ERROR([[cannot find zlib header]])]) +# AC_CHECK_LIB([z], [zlibVersion],, [AC_MSG_ERROR([[cannot find zlib]]);COMPILE_ICU=1]) +# ICU_FILES=`findLibrary "libicuuc.${SHARED_EXT}"` + +AC_CHECK_TOOL(HAVE_ICU_CONFIG, icu-config,, "${ICU_PATH}:${PATH}") +if test [ ! "$HAVE_ICU_CONFIG" ] +then + AC_MSG_ERROR([[cannot find icu-config]]) +else +OLDPATH=$PATH +PATH="${ICU_PATH}:${PATH}" +ICU_CFLAGS=`icu-config --cxxflags`; +ICU_LDFLAGS=`icu-config --ldflags`; +ICU_VER=`icu-config --version`; +ICU_FILES="`findLibrary "libicuuc.${SHARED_EXT}"` `findLibrary "libicudata.${SHARED_EXT}"` `findLibrary "libicui18n.${SHARED_EXT}"`" +PATH=$OLDPATH +if [[ $ICU_VER \< "4.2" ]] + then +AC_MSG_ERROR([[You need a version of libicu >= 4.2]]) + fi +fi + + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LDFLAGS) +AC_SUBST(ICU_STATIC_LDFLAGS) +AC_SUBST(ICU_FILES) +AC_SUBST(COMPILED_ICUDATA_DAT) + + # Configure the output files AC_CONFIG_FILES([ Makefile diff --git a/zimwriterfs/indexer.cpp b/zimwriterfs/indexer.cpp new file mode 100644 index 000..7820a32 --- /dev/null +++ b/zimwriterfs/indexer.cpp @@ -0,0 +1,262 @@ +/* + * Copyright 2011-2014 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "indexer.h" +#include "resourceTools.h" +#include "pathTools.h"