[MediaWiki-commits] [Gerrit] Add a indexer. - change (openzim)

2016-07-03 Thread Kelson (Code Review)
Kelson has submitted this change and it was merged.

Change subject: Add a indexer.
..


Add a indexer.

This indexer is not used.
This is mainly code from kiwix-indexer imported in openzim.
Unused functions in *Tools have been removed.
No dependency on xapian.

Change-Id: I55079339d21d6903634c265f83f4d1c6ba0ac333
---
M zimwriterfs/Makefile.am
A zimwriterfs/indexer.cpp
A zimwriterfs/indexer.h
A zimwriterfs/pathTools.cpp
A zimwriterfs/pathTools.h
A zimwriterfs/resourceTools.cpp
A zimwriterfs/resourceTools.h
M zimwriterfs/zimwriterfs.cpp
8 files changed, 921 insertions(+), 2 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index 92641d9..628b74c 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -6,4 +6,7 @@
 tools.cpp \
 article.cpp \
 articlesource.cpp \
+indexer.cpp \
+resourceTools.cpp \
+pathTools.cpp \
 mimetypecounter.cpp
diff --git a/zimwriterfs/indexer.cpp b/zimwriterfs/indexer.cpp
new file mode 100644
index 000..7820a32
--- /dev/null
+++ b/zimwriterfs/indexer.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2014 Emmanuel Engelhart 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "indexer.h"
+#include "resourceTools.h"
+#include "pathTools.h"
+#include 
+
+  /* Count word */
+  unsigned int Indexer::countWords(const string &text) {
+unsigned int numWords = 1;
+unsigned int length = text.size();
+
+for(unsigned int i=0; istopWords.clear();
+
+while (getline(file, stopWord, '\n')) {
+  this->stopWords.push_back(stopWord);
+}
+  }
+
+  /* Article indexer methods
+   *
+   * indexArticles() has the void *(void *) shape of a pthread start routine
+   * (presumably it is handed to pthread_create; the caller is not visible
+   * here). ptr is cast back to the Indexer instance to work on.
+   *
+   * It calls indexingPrelude() with the index path, then repeatedly pops a
+   * token from the to-index queue and passes its fields to index(), calling
+   * flush() every 5000 indexed articles. Once popFromToIndexQueue() returns
+   * false it calls indexingPostlude(), writes the value of getZimId() to the
+   * "content.id" path built from getIndexPath(), sets the running flag to
+   * false and terminates the calling thread with pthread_exit(). */
+  void *Indexer::indexArticles(void *ptr) {
+pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL); /* cancellation only takes effect at cancellation points */
+Indexer *self = (Indexer *)ptr;
+unsigned int indexedArticleCount = 0;
+indexerToken token;
+
+self->indexingPrelude(self->getIndexPath());
+
+while (self->popFromToIndexQueue(token)) {
+  self->index(token.url,
+ token.accentedTitle,
+ token.title,
+ token.keywords,
+ token.content,
+ token.snippet,
+ token.size,
+ token.wordCount
+ );
+
+  indexedArticleCount += 1;
+
+  /* Make a hard-disk flush every 5000 articles */
+  if (indexedArticleCount % 5000 == 0) {
+   self->flush();
+  }
+
+  /* Test if the thread should be cancelled */
+  pthread_testcancel();
+}
+self->indexingPostlude();
+
+/* Write content id file */
+string path = appendToDirectory(self->getIndexPath(), "content.id");
+writeTextFile(path, self->getZimId());
+
+usleep(100); /* 100 microsecond pause; NOTE(review): the reason is not visible in this file */
+
+self->articleIndexerRunning(false);
+pthread_exit(NULL);
+return NULL; /* not reached: pthread_exit() does not return */
+  }
+
+  void Indexer::articleIndexerRunning(bool value) { /* Setter: stores value in articleIndexerRunningFlag */
+pthread_mutex_lock(&articleIndexerRunningMutex); /* the flag is only written while this mutex is held */
+this->articleIndexerRunningFlag = value;
+pthread_mutex_unlock(&articleIndexerRunningMutex);
+  }
+
+  bool Indexer::isArticleIndexerRunning() { /* Getter: returns the current articleIndexerRunningFlag */
+pthread_mutex_lock(&articleIndexerRunningMutex);
+bool retVal = this->articleIndexerRunningFlag; /* copy taken under the mutex so it can be returned after unlocking */
+pthread_mutex_unlock(&articleIndexerRunningMutex);
+return retVal;
+  }
+
+  /* ToIndexQueue methods
+   *
+   * isToIndexQueueEmpty() returns toIndexQueue.empty(), read while holding
+   * toIndexQueueMutex. The result is only a snapshot: the mutex is released
+   * before the value is returned. */
+  bool Indexer::isToIndexQueueEmpty() {
+pthread_mutex_lock(&toIndexQueueMutex);
+bool retVal = this->toIndexQueue.empty();
+pthread_mutex_unlock(&toIndexQueueMutex);
+return retVal;
+  }
+
+  void Indexer::pushToIndexQueue(indexerToken &token) { /* Appends token to toIndexQueue, then sleeps in proportion to the queue length */
+pthread_mutex_lock(&toIndexQueueMutex);
+this->toIndexQueue.push(token);
+pthread_mutex_unlock(&toIndexQueueMutex);
+usleep(int(this->toIndexQueue.size() / 200) / 10 * 1000); /* integer maths: 1000 us per full 2000 queued tokens, 0 below 2000. NOTE(review): size() is read after the mutex is released - confirm this unsynchronised read is intended */
+  }
+
+  bool Indexer::popFromToIndexQueue(indexerToken &token) {
+while (this->isToIndexQueueEmpty()) {
+  usleep(500);
+  if (this->getVerboseFlag()) {
+   std::cout << "Waiting... ToIndexQueue is empty for now..." << std::endl;
+  }
+
+  pthread_testcancel();
+}
+
+pthread_mutex_lock(&toIndexQueueMutex);
+token = this->toIndexQueue.front();
+this->toIndexQueue.pop();
+pthread_mutex_unlock(&toIndexQueueMutex);
+
+

[MediaWiki-commits] [Gerrit] Add a indexer. - change (openzim)

2016-06-22 Thread Mgautierfr (Code Review)
Mgautierfr has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/295520

Change subject: Add a indexer.
..

Add a indexer.

This indexer is not used.
This is mainly code from kiwix-indexer imported in openzim.
Unused functions in *Tools have been removed.
No dependency on xapian.

Change-Id: I0683706a136fb2303234e4caee77e9221714a5b1
---
M zimwriterfs/Makefile.am
M zimwriterfs/configure.ac
A zimwriterfs/indexer.cpp
A zimwriterfs/indexer.h
A zimwriterfs/pathTools.cpp
A zimwriterfs/pathTools.h
A zimwriterfs/resourceTools.cpp
A zimwriterfs/resourceTools.h
M zimwriterfs/tools.cpp
M zimwriterfs/tools.h
10 files changed, 1,009 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/20/295520/1

diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index 6e46553..e54c64d 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -5,4 +5,10 @@
 zimwriterfs.cpp \
 tools.cpp \
 article.cpp \
-articlesource.cpp
+articlesource.cpp \
+indexer.cpp \
+resourceTools.cpp \
+pathTools.cpp
+
+zimwriterfs_CXXFLAGS = $(ICU_CFLAGS)
+zimwriterfs_LDFLAGS = $(ICU_LDFLAGS)
diff --git a/zimwriterfs/configure.ac b/zimwriterfs/configure.ac
index fb12c8f..ba23cd9 100644
--- a/zimwriterfs/configure.ac
+++ b/zimwriterfs/configure.ac
@@ -71,6 +71,81 @@
 AC_DEFINE_UNQUOTED(LZMA_MEMORY_SIZE, 128, [set lzma uncompress memory size to 
number of MB])
 AC_DEFINE(ENABLE_LZMA, [1], [defined if lzma compression is enabled])
 
+
+function findLibrary { # Prints the path of a file named $1 found under the colon-separated directories of LIBS_ROOT, or "no" if none is found
+   found=0
+   for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do # first pass: only accept paths that match $ARCH
+   sf=`find $f -name $1 | grep $ARCH | head -1 2> /dev/null` # NOTE(review): the stderr redirect applies to head only, not to find
+   if [[ -f "$sf" -a $found -eq 0 ]]
+   then
+   found=1
+   echo $sf
+   fi
+   done
+   if [[ $found -eq 0 ]] # second pass, only when the first found nothing: same search without the $ARCH filter
+   then
+   for f in $(echo $LIBS_ROOT|tr ":" "\n") ; do
+   sf=`find $f -name $1 | head -1 2> /dev/null`
+   if [[ -f "$sf" -a $found -eq 0 ]]
+   then
+   found=1
+   echo $sf
+   fi
+   done
+   fi
+   if [[ $found -eq 0 ]] # nothing matched in either pass
+   then
+   echo "no"
+   fi
+}
+
+
+
+ ICU
+
+
+
+ICU_CFLAGS=""
+ICU_LDFLAGS="-licui18n -licuuc -licudata" # replaced by icu-config
+ICU_STATIC_LDFLAGS=""
+
+# If --with-icu=DIR is given, prepend DIR to LIBRARY_PATH and remember it in ICU_PATH
+AC_ARG_WITH(icu,
+AC_HELP_STRING([--with-icu=DIR], [alternate location for 
+icu-config]),
+export 
+LIBRARY_PATH="${withval}:${LIBRARY_PATH}";ICU_PATH=${withval}
+   )
+
+# look for shared library.
+# AC_CHECK_HEADER([zlib.h],, [AC_MSG_ERROR([[cannot find zlib header]])])
+# AC_CHECK_LIB([z], [zlibVersion],, [AC_MSG_ERROR([[cannot find 
+zlib]]);COMPILE_ICU=1])
+# ICU_FILES=`findLibrary "libicuuc.${SHARED_EXT}"`
+
+AC_CHECK_TOOL(HAVE_ICU_CONFIG, icu-config,, "${ICU_PATH}:${PATH}")
+if test [ ! "$HAVE_ICU_CONFIG" ]
+then
+ AC_MSG_ERROR([[cannot find icu-config]])
+else
+OLDPATH=$PATH
+PATH="${ICU_PATH}:${PATH}" # temporarily put ICU_PATH first so the icu-config calls below resolve there; restored from OLDPATH afterwards
+ICU_CFLAGS=`icu-config --cxxflags`;
+ICU_LDFLAGS=`icu-config --ldflags`;
+ICU_VER=`icu-config --version`;
+ICU_FILES="`findLibrary "libicuuc.${SHARED_EXT}"` `findLibrary 
+"libicudata.${SHARED_EXT}"` `findLibrary "libicui18n.${SHARED_EXT}"`"
+PATH=$OLDPATH
+if [[ $ICU_VER \< "4.2" ]] # NOTE(review): this is a string comparison, not a numeric version comparison - confirm it orders all expected versions correctly
+   then
+AC_MSG_ERROR([[You need a version of libicu >= 4.2]])
+   fi
+fi
+
+
+AC_SUBST(ICU_CFLAGS)
+AC_SUBST(ICU_LDFLAGS)
+AC_SUBST(ICU_STATIC_LDFLAGS)
+AC_SUBST(ICU_FILES)
+AC_SUBST(COMPILED_ICUDATA_DAT) # NOTE(review): this variable is not assigned anywhere in the visible part of this file
+
+
+
+
 # Configure the output files
 AC_CONFIG_FILES([
   Makefile
diff --git a/zimwriterfs/indexer.cpp b/zimwriterfs/indexer.cpp
new file mode 100644
index 000..7820a32
--- /dev/null
+++ b/zimwriterfs/indexer.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2014 Emmanuel Engelhart 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "indexer.h"
+#include "resourceTools.h"
+#include "pathTools.h"