Kelson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/295515 )

Change subject: Move few utility functions to a separate module.
......................................................................


Move few utility functions to a separate module.

Change-Id: Ia26754d14f0cd6c557626675beb5a7c5fe2cadaa
---
M zimwriterfs/Makefile.am
A zimwriterfs/tools.cpp
A zimwriterfs/tools.h
M zimwriterfs/zimwriterfs.cpp
4 files changed, 578 insertions(+), 481 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am
index 0caa3c8..ea2ab7a 100644
--- a/zimwriterfs/Makefile.am
+++ b/zimwriterfs/Makefile.am
@@ -1,3 +1,6 @@
 AUTOMAKE_OPTIONS=subdir-objects
 bin_PROGRAMS=zimwriterfs
-zimwriterfs_SOURCES=zimwriterfs.cpp
+
+zimwriterfs_SOURCES= \
+        zimwriterfs.cpp \
+        tools.cpp
diff --git a/zimwriterfs/tools.cpp b/zimwriterfs/tools.cpp
new file mode 100644
index 0000000..019b22c
--- /dev/null
+++ b/zimwriterfs/tools.cpp
@@ -0,0 +1,525 @@
+/*
+ * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org>
+ * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "tools.h"
+
+
+#include <zlib.h>
+#include <string.h>
+#include <stdexcept>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <cerrno>
+#include <sys/stat.h>
+#include <magic.h>
+
+#ifdef _WIN32
+#define SEPARATOR "\\"
+#else
+#define SEPARATOR "/"
+#endif
+
+
+/* Init file extensions hash */
+static std::map<std::string, std::string> _create_extMimeTypes(){
+  std::map<std::string, std::string> extMimeTypes;
+  extMimeTypes["HTML"] = "text/html";
+  extMimeTypes["html"] = "text/html";
+  extMimeTypes["HTM"] = "text/html";
+  extMimeTypes["htm"] = "text/html";
+  extMimeTypes["PNG"] = "image/png";
+  extMimeTypes["png"] = "image/png";
+  extMimeTypes["TIFF"] = "image/tiff";
+  extMimeTypes["tiff"] = "image/tiff";
+  extMimeTypes["TIF"] = "image/tiff";
+  extMimeTypes["tif"] = "image/tiff";
+  extMimeTypes["JPEG"] = "image/jpeg";
+  extMimeTypes["jpeg"] = "image/jpeg";
+  extMimeTypes["JPG"] = "image/jpeg";
+  extMimeTypes["jpg"] = "image/jpeg";
+  extMimeTypes["GIF"] = "image/gif";
+  extMimeTypes["gif"] = "image/gif";
+  extMimeTypes["SVG"] = "image/svg+xml";
+  extMimeTypes["svg"] = "image/svg+xml";
+  extMimeTypes["TXT"] = "text/plain";
+  extMimeTypes["txt"] = "text/plain";
+  extMimeTypes["XML"] = "text/xml";
+  extMimeTypes["xml"] = "text/xml";
+  extMimeTypes["EPUB"] = "application/epub+zip";
+  extMimeTypes["epub"] = "application/epub+zip";
+  extMimeTypes["PDF"] = "application/pdf";
+  extMimeTypes["pdf"] = "application/pdf";
+  extMimeTypes["OGG"] = "application/ogg";
+  extMimeTypes["ogg"] = "application/ogg";
+  extMimeTypes["JS"] = "application/javascript";
+  extMimeTypes["js"] = "application/javascript";
+  extMimeTypes["JSON"] = "application/json";
+  extMimeTypes["json"] = "application/json";
+  extMimeTypes["CSS"] = "text/css";
+  extMimeTypes["css"] = "text/css";
+  extMimeTypes["otf"] = "application/vnd.ms-opentype";
+  extMimeTypes["OTF"] = "application/vnd.ms-opentype";
+  extMimeTypes["eot"] = "application/vnd.ms-fontobject";
+  extMimeTypes["EOT"] = "application/vnd.ms-fontobject";
+  extMimeTypes["ttf"] = "application/font-ttf";
+  extMimeTypes["TTF"] = "application/font-ttf";
+  extMimeTypes["woff"] = "application/font-woff";
+  extMimeTypes["WOFF"] = "application/font-woff";
+  extMimeTypes["vtt"] = "text/vtt";
+  extMimeTypes["VTT"] = "text/vtt";
+  
+  return extMimeTypes;
+}
+
+static std::map<std::string, std::string> extMimeTypes = 
_create_extMimeTypes();
+
+static std::map<std::string, std::string> fileMimeTypes;
+
+
+extern std::string directoryPath;
+extern bool inflateHtmlFlag;
+extern bool uniqueNamespace;
+extern magic_t magic;
+
+/* Decompress an STL string using zlib and return the original data. */
+inline std::string inflateString(const std::string& str) {
+  z_stream zs; // z_stream is zlib's control structure
+  memset(&zs, 0, sizeof(zs));
+
+  if (inflateInit(&zs) != Z_OK)
+    throw(std::runtime_error("inflateInit failed while decompressing."));
+
+  zs.next_in = (Bytef*)str.data();
+  zs.avail_in = str.size();
+
+  int ret;
+  char outbuffer[32768];
+  std::string outstring;
+
+  // get the decompressed bytes blockwise using repeated calls to inflate
+  do {
+    zs.next_out = reinterpret_cast<Bytef*>(outbuffer);
+    zs.avail_out = sizeof(outbuffer);
+
+    ret = inflate(&zs, 0);
+
+    if (outstring.size() < zs.total_out) {
+      outstring.append(outbuffer,
+                      zs.total_out - outstring.size());
+    }
+
+  } while (ret == Z_OK);
+
+  inflateEnd(&zs);
+
+  if (ret != Z_STREAM_END) { // an error occurred that was not EOF
+    std::ostringstream oss;
+    oss << "Exception during zlib decompression: (" << ret << ") "
+       << zs.msg;
+    throw(std::runtime_error(oss.str()));
+  }
+
+  return outstring;
+}
+
+inline bool seemsToBeHtml(const std::string &path) {
+  if (path.find_last_of(".") != std::string::npos) {
+    std::string mimeType = path.substr(path.find_last_of(".")+1);
+    if (extMimeTypes.find(mimeType) != extMimeTypes.end()) {
+      return "text/html" == extMimeTypes[mimeType];
+    }
+  }
+
+  return false;
+}
+
+std::string getFileContent(const std::string &path) {
+  std::ifstream in(path.c_str(), ::std::ios::binary);
+  if (in) {
+    std::string contents;
+    in.seekg(0, std::ios::end);
+    contents.resize(in.tellg());
+    in.seekg(0, std::ios::beg);
+    in.read(&contents[0], contents.size());
+    in.close();
+
+    /* Inflate if necessary */
+    if (inflateHtmlFlag && seemsToBeHtml(path)) {
+      try {
+       contents = inflateString(contents);
+      } catch(...) {
+       std::cerr << "Can not initialize inflate stream for: " << path << 
std::endl;
+      }
+    }
+    return(contents);
+ }
+  std::cerr << "zimwriterfs: unable to open file at path: " << path << 
std::endl;
+  throw(errno);
+}
+
+unsigned int getFileSize(const std::string &path) {
+  struct stat filestatus;
+  stat(path.c_str(), &filestatus);
+  return filestatus.st_size;
+}    
+
+bool fileExists(const std::string &path) {
+  bool flag = false;
+  std::fstream fin;
+  fin.open(path.c_str(), std::ios::in);
+  if (fin.is_open()) {
+    flag = true;
+  }
+  fin.close();
+  return flag;
+}
+
+/* base64 */
+static const std::string base64_chars = 
+             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+             "abcdefghijklmnopqrstuvwxyz"
+  "0123456789+/";
+
+std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int 
in_len) {
+  std::string ret;
+  int i = 0;
+  int j = 0;
+  unsigned char char_array_3[3];
+  unsigned char char_array_4[4];
+
+  while (in_len--) {
+    char_array_3[i++] = *(bytes_to_encode++);
+    if (i == 3) {
+      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+      char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 
0xf0) >> 4);
+      char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 
0xc0) >> 6);
+      char_array_4[3] = char_array_3[2] & 0x3f;
+
+      for(i = 0; (i <4) ; i++)
+        ret += base64_chars[char_array_4[i]];
+      i = 0;
+    }
+  }
+
+  if (i)
+    {
+      for(j = i; j < 3; j++)
+       char_array_3[j] = '\0';
+
+      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+      char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 
0xf0) >> 4);
+      char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 
0xc0) >> 6);
+      char_array_4[3] = char_array_3[2] & 0x3f;
+
+      for (j = 0; (j < i + 1); j++)
+       ret += base64_chars[char_array_4[j]];
+
+      while((i++ < 3))
+       ret += '=';
+
+    }
+
+  return ret;
+
+}
+
+std::string decodeUrl(const std::string &encodedUrl) {
+  std::string decodedUrl = encodedUrl;
+  std::string::size_type pos = 0;
+  char ch;
+
+  while ((pos = decodedUrl.find('%', pos)) != std::string::npos &&
+        pos + 2 < decodedUrl.length()) {
+    sscanf(decodedUrl.substr(pos + 1, 2).c_str(), "%x", (unsigned int*)&ch);
+    decodedUrl.replace(pos, 3, 1, ch);
+    ++pos;
+  }
+
+  return decodedUrl;
+}
+
+std::string removeLastPathElement(const std::string& path, const bool 
removePreSeparator, const bool removePostSeparator) {
+  std::string newPath = path;
+  size_t offset = newPath.find_last_of(SEPARATOR);
+ 
+  if (removePreSeparator && offset == newPath.length()-1) {
+    newPath = newPath.substr(0, offset);
+    offset = newPath.find_last_of(SEPARATOR);
+  }
+  newPath = removePostSeparator ? newPath.substr(0, offset) : 
newPath.substr(0, offset+1);
+
+  return newPath;
+}
+
+/* Split string in a token array */
+std::vector<std::string> split(const std::string & str,
+                                      const std::string & delims=" *-")
+{
+  std::string::size_type lastPos = str.find_first_not_of(delims, 0);
+  std::string::size_type pos = str.find_first_of(delims, lastPos);
+  std::vector<std::string> tokens;
+
+  while (std::string::npos != pos || std::string::npos != lastPos)
+    {
+      tokens.push_back(str.substr(lastPos, pos - lastPos));
+      lastPos = str.find_first_not_of(delims, pos);
+      pos     = str.find_first_of(delims, lastPos);
+    }
+
+  return tokens;
+}
+
+std::vector<std::string> split(const char* lhs, const char* rhs){
+  const std::string m1 (lhs), m2 (rhs);
+  return split(m1, m2);
+}
+
+std::vector<std::string> split(const char* lhs, const std::string& rhs){
+  return split(lhs, rhs.c_str());
+}
+
+std::vector<std::string> split(const std::string& lhs, const char* rhs){
+  return split(lhs.c_str(), rhs);
+}
+
+/* Warning: the relative path must be with slashes */
+std::string computeAbsolutePath(const std::string& path, const std::string& 
relativePath) {
+
+  /* Add a trailing / to the path if necessary */
+  std::string absolutePath = path[path.length()-1] == '/' ? path : 
removeLastPathElement(path, false, false);
+
+  /* Go through relative path */
+  std::vector<std::string> relativePathElements;
+  std::stringstream relativePathStream(relativePath);
+  std::string relativePathItem;
+  while (std::getline(relativePathStream, relativePathItem, '/')) {
+    if (relativePathItem == "..") {
+      absolutePath = removeLastPathElement(absolutePath, true, false); 
+    } else if (!relativePathItem.empty() && relativePathItem != ".") {
+      absolutePath += relativePathItem;
+      absolutePath += "/";
+    }
+  }
+  
+  /* Remove wront trailing / */
+  return absolutePath.substr(0, absolutePath.length()-1);
+}
+
+/* Warning: the relative path must be with slashes */
+std::string computeRelativePath(const std::string path, const std::string 
absolutePath) {
+  std::vector<std::string> pathParts = split(path, "/");
+  std::vector<std::string> absolutePathParts = split(absolutePath, "/");
+
+  unsigned int commonCount = 0;
+  while (commonCount < pathParts.size() && 
+        commonCount < absolutePathParts.size() && 
+        pathParts[commonCount] == absolutePathParts[commonCount]) {
+    if (!pathParts[commonCount].empty()) {
+      commonCount++;
+    }
+  }
+    
+  std::string relativePath;
+  for (unsigned int i = commonCount ; i < pathParts.size()-1 ; i++) {
+    relativePath += "../";
+  }
+
+  for (unsigned int i = commonCount ; i < absolutePathParts.size() ; i++) {
+    relativePath += absolutePathParts[i];
+    relativePath += i + 1 < absolutePathParts.size() ? "/" : "";
+  }
+
+  return relativePath;
+}
+
+
+
+static bool isLocalUrl(const std::string url) {
+  if (url.find(":") != std::string::npos) {
+    return (!(
+             url.find("://") != std::string::npos ||
+             url.find("//") == 0 ||
+             url.find("tel:") == 0 ||
+             url.find("geo:") == 0
+             ));
+  }
+  return true;
+}
+
+std::string extractRedirectUrlFromHtml(const GumboVector* head_children) {
+  std::string url;
+  
+  for (int i = 0; i < head_children->length; ++i) {
+    GumboNode* child = (GumboNode*)(head_children->data[i]);
+    if (child->type == GUMBO_NODE_ELEMENT &&
+       child->v.element.tag == GUMBO_TAG_META) {
+      GumboAttribute* attribute;
+      if (attribute = gumbo_get_attribute(&child->v.element.attributes, 
"http-equiv")) {
+       if (!strcmp(attribute->value, "refresh")) {
+         if (attribute = gumbo_get_attribute(&child->v.element.attributes, 
"content")) {
+           std::string targetUrl = attribute->value;
+           std::size_t found = targetUrl.find("URL=") != std::string::npos ? 
targetUrl.find("URL=") : targetUrl.find("url=");
+           if (found!=std::string::npos) {
+             url = targetUrl.substr(found+4);
+           } else {
+             throw std::string("Unable to find the redirect/refresh target url 
from the HTML DOM");
+           }
+         }
+       }
+      }
+    }
+  }
+  
+  return url;
+}
+
+void getLinks(GumboNode* node, std::map<std::string, bool> &links) {
+  if (node->type != GUMBO_NODE_ELEMENT) {
+    return;
+  }
+
+  GumboAttribute* attribute = NULL;
+  attribute = gumbo_get_attribute(&node->v.element.attributes, "href");
+  if (attribute == NULL) {
+    attribute = gumbo_get_attribute(&node->v.element.attributes, "src");
+  }
+
+  if (attribute != NULL && isLocalUrl(attribute->value)) {
+    links[attribute->value] = true;
+  }
+
+  GumboVector* children = &node->v.element.children;
+  for (int i = 0; i < children->length; ++i) {
+    getLinks(static_cast<GumboNode*>(children->data[i]), links);
+  }
+}
+
+void replaceStringInPlaceOnce(std::string& subject,
+                              const std::string& search,
+                              const std::string& replace) {
+  size_t pos = 0;
+  while ((pos = subject.find(search, pos)) != std::string::npos) {
+    subject.replace(pos, search.length(), replace);
+    pos += replace.length();
+    return; /* Do it once */
+  }
+}
+
+void replaceStringInPlace(std::string& subject,
+                          const std::string& search,
+                          const std::string& replace) {
+  size_t pos = 0;
+  while ((pos = subject.find(search, pos)) != std::string::npos) {
+    subject.replace(pos, search.length(), replace);
+    pos += replace.length();
+  }
+
+  return;
+}
+
+void stripTitleInvalidChars(std::string & str) {
+
+  /* Remove unicode orientation invisible characters */
+  replaceStringInPlace(str, "\u202A", "");
+  replaceStringInPlace(str, "\u202C", "");
+}
+
+std::string getMimeTypeForFile(const std::string& filename) {
+  std::string mimeType;
+
+  /* Try to get the mimeType from the file extension */
+  if (filename.find_last_of(".") != std::string::npos) {
+    mimeType = filename.substr(filename.find_last_of(".")+1);
+    if (extMimeTypes.find(mimeType) != extMimeTypes.end()) {
+      return extMimeTypes[mimeType];
+    }
+  }
+
+  /* Try to get the mimeType from the cache */
+  if (fileMimeTypes.find(filename) != fileMimeTypes.end()) {
+    return fileMimeTypes[filename];
+  }
+
+  /* Try to get the mimeType with libmagic */
+  try {
+    std::string path = directoryPath + "/" + filename;
+    mimeType = std::string(magic_file(magic, path.c_str()));
+    if (mimeType.find(";") != std::string::npos) {
+      mimeType = mimeType.substr(0, mimeType.find(";"));
+    }
+    fileMimeTypes[filename] = mimeType;
+    return mimeType;
+  } catch (...) {
+    return "";
+  }
+}
+
+std::string getNamespaceForMimeType(const std::string& mimeType) {
+  if (uniqueNamespace || mimeType.find("text") == 0 || mimeType.empty()) {
+    if (uniqueNamespace || mimeType.find("text/html") == 0 || 
mimeType.empty()) {
+      return "A";
+    } else {
+      return "-";
+    }
+  } else {
+    if (mimeType == "application/font-ttf" || 
+       mimeType == "application/font-woff" ||
+       mimeType == "application/vnd.ms-opentype" ||
+       mimeType == "application/vnd.ms-fontobject" ||
+       mimeType == "application/javascript" ||
+       mimeType == "application/json"
+       ) {
+      return "-";
+    } else {
+      return "I";
+    }
+  }
+}
+
+inline std::string removeLocalTagAndParameters(const std::string &url) {
+  std::string retVal = url;
+  std::size_t found;
+
+  /* Remove URL arguments */
+  found = retVal.find("?");
+  if (found != std::string::npos) {
+    retVal = retVal.substr(0, found);
+  }
+
+  /* Remove local tag */
+  found = retVal.find("#");
+  if (found != std::string::npos) {
+    retVal = retVal.substr(0, found);
+  }
+
+  return retVal;
+}
+
+std::string computeNewUrl(const std::string &aid, const std::string &url) {
+  std::string filename = computeAbsolutePath(aid, url);
+  std::string targetMimeType = 
getMimeTypeForFile(removeLocalTagAndParameters(decodeUrl(filename)));
+  std::string originMimeType = getMimeTypeForFile(aid);
+  std::string newUrl = "/" + getNamespaceForMimeType(targetMimeType) + "/" + 
filename;
+  std::string baseUrl = "/" + getNamespaceForMimeType(originMimeType) + "/" + 
aid;
+  return computeRelativePath(baseUrl, newUrl);
+}
+
diff --git a/zimwriterfs/tools.h b/zimwriterfs/tools.h
new file mode 100644
index 0000000..ec6b454
--- /dev/null
+++ b/zimwriterfs/tools.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org>
+ * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+
+#ifndef OPENZIM_ZIMWRITERFS_TOOLS_H
+#define OPENZIM_ZIMWRITERFS_TOOLS_H
+
+#include <string>
+#include <map>
+#include <gumbo.h>
+
+std::string getMimeTypeForFile(const std::string& filename);
+std::string getNamespaceForMimeType(const std::string& mimeType);
+std::string getFileContent(const std::string &path);
+unsigned int getFileSize(const std::string &path);
+std::string decodeUrl(const std::string &encodedUrl);
+std::string computeAbsolutePath(const std::string& path, const std::string& 
relativePath);
+bool fileExists(const std::string &path);
+std::string removeLastPathElement(const std::string& path, const bool 
removePreSeparator, const bool removePostSeparator);
+std::string computeNewUrl(const std::string &aid, const std::string &url);
+
+std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int 
in_len);
+
+void replaceStringInPlaceOnce(std::string& subject, const std::string& search, 
const std::string& replace);
+void replaceStringInPlace(std::string& subject, const std::string& search, 
const std::string& replace);
+
+std::string extractRedirectUrlFromHtml(const GumboVector* head_children);
+void getLinks(GumboNode* node, std::map<std::string, bool> &links);
+
+#endif // OPENZIM_ZIMWRITERFS_TOOLS_H
diff --git a/zimwriterfs/zimwriterfs.cpp b/zimwriterfs/zimwriterfs.cpp
index f110e29..693e6a1 100644
--- a/zimwriterfs/zimwriterfs.cpp
+++ b/zimwriterfs/zimwriterfs.cpp
@@ -28,31 +28,19 @@
 
 #include <iomanip>
 #include <fstream>
-#include <iostream>
 #include <sstream>
-#include <vector>
 #include <queue>
 #include <map>
 #include <cstdio>
-#include <cerrno>
-#include <stdexcept>
-
 #include <magic.h>
-
-#include <zlib.h>
 
 #include <zim/writer/zimcreator.h>
 #include <zim/blob.h>
 
-#include <gumbo.h>
+
+#include "tools.h"
 
 #define MAX_QUEUE_SIZE 100
-
-#ifdef _WIN32
-#define SEPARATOR "\\"
-#else
-#define SEPARATOR "/"
-#endif
 
 std::string language;
 std::string creator;
@@ -80,261 +68,9 @@
 
 magic_t magic;
 std::map<std::string, unsigned int> counters;
-std::map<std::string, std::string> fileMimeTypes;
-std::map<std::string, std::string> extMimeTypes;
 char *data = NULL;
 unsigned int dataSize = 0;
 
-/* Decompress an STL string using zlib and return the original data. */
-inline std::string inflateString(const std::string& str) {
-  z_stream zs; // z_stream is zlib's control structure
-  memset(&zs, 0, sizeof(zs));
-
-  if (inflateInit(&zs) != Z_OK)
-    throw(std::runtime_error("inflateInit failed while decompressing."));
-
-  zs.next_in = (Bytef*)str.data();
-  zs.avail_in = str.size();
-
-  int ret;
-  char outbuffer[32768];
-  std::string outstring;
-
-  // get the decompressed bytes blockwise using repeated calls to inflate
-  do {
-    zs.next_out = reinterpret_cast<Bytef*>(outbuffer);
-    zs.avail_out = sizeof(outbuffer);
-
-    ret = inflate(&zs, 0);
-
-    if (outstring.size() < zs.total_out) {
-      outstring.append(outbuffer,
-                      zs.total_out - outstring.size());
-    }
-
-  } while (ret == Z_OK);
-
-  inflateEnd(&zs);
-
-  if (ret != Z_STREAM_END) { // an error occurred that was not EOF
-    std::ostringstream oss;
-    oss << "Exception during zlib decompression: (" << ret << ") "
-       << zs.msg;
-    throw(std::runtime_error(oss.str()));
-  }
-
-  return outstring;
-}
-
-inline bool seemsToBeHtml(const std::string &path) {
-  if (path.find_last_of(".") != std::string::npos) {
-    std::string mimeType = path.substr(path.find_last_of(".")+1);
-    if (extMimeTypes.find(mimeType) != extMimeTypes.end()) {
-      return "text/html" == extMimeTypes[mimeType];
-    }
-  }
-
-  return false;
-}
-
-inline std::string getFileContent(const std::string &path) {
-  std::ifstream in(path.c_str(), ::std::ios::binary);
-  if (in) {
-    std::string contents;
-    in.seekg(0, std::ios::end);
-    contents.resize(in.tellg());
-    in.seekg(0, std::ios::beg);
-    in.read(&contents[0], contents.size());
-    in.close();
-
-    /* Inflate if necessary */
-    if (inflateHtmlFlag && seemsToBeHtml(path)) {
-      try {
-       contents = inflateString(contents);
-      } catch(...) {
-       std::cerr << "Can not initialize inflate stream for: " << path << 
std::endl;
-      }
-    }
-    return(contents);
- }
-  std::cerr << "zimwriterfs: unable to open file at path: " << path << 
std::endl;
-  throw(errno);
-}
-
-inline unsigned int getFileSize(const std::string &path) {
-  struct stat filestatus;
-  stat(path.c_str(), &filestatus);
-  return filestatus.st_size;
-}    
-
-inline bool fileExists(const std::string &path) {
-  bool flag = false;
-  std::fstream fin;
-  fin.open(path.c_str(), std::ios::in);
-  if (fin.is_open()) {
-    flag = true;
-  }
-  fin.close();
-  return flag;
-}
-
-/* base64 */
-static const std::string base64_chars = 
-             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-             "abcdefghijklmnopqrstuvwxyz"
-  "0123456789+/";
-
-std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int 
in_len) {
-  std::string ret;
-  int i = 0;
-  int j = 0;
-  unsigned char char_array_3[3];
-  unsigned char char_array_4[4];
-
-  while (in_len--) {
-    char_array_3[i++] = *(bytes_to_encode++);
-    if (i == 3) {
-      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
-      char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 
0xf0) >> 4);
-      char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 
0xc0) >> 6);
-      char_array_4[3] = char_array_3[2] & 0x3f;
-
-      for(i = 0; (i <4) ; i++)
-        ret += base64_chars[char_array_4[i]];
-      i = 0;
-    }
-  }
-
-  if (i)
-    {
-      for(j = i; j < 3; j++)
-       char_array_3[j] = '\0';
-
-      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
-      char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 
0xf0) >> 4);
-      char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 
0xc0) >> 6);
-      char_array_4[3] = char_array_3[2] & 0x3f;
-
-      for (j = 0; (j < i + 1); j++)
-       ret += base64_chars[char_array_4[j]];
-
-      while((i++ < 3))
-       ret += '=';
-
-    }
-
-  return ret;
-
-}
-
-inline std::string decodeUrl(const std::string &encodedUrl) {
-  std::string decodedUrl = encodedUrl;
-  std::string::size_type pos = 0;
-  char ch;
-
-  while ((pos = decodedUrl.find('%', pos)) != std::string::npos &&
-        pos + 2 < decodedUrl.length()) {
-    sscanf(decodedUrl.substr(pos + 1, 2).c_str(), "%x", (unsigned int*)&ch);
-    decodedUrl.replace(pos, 3, 1, ch);
-    ++pos;
-  }
-
-  return decodedUrl;
-}
-
-inline std::string removeLastPathElement(const std::string path, const bool 
removePreSeparator, const bool removePostSeparator) {
-  std::string newPath = path;
-  size_t offset = newPath.find_last_of(SEPARATOR);
- 
-  if (removePreSeparator && offset == newPath.length()-1) {
-    newPath = newPath.substr(0, offset);
-    offset = newPath.find_last_of(SEPARATOR);
-  }
-  newPath = removePostSeparator ? newPath.substr(0, offset) : 
newPath.substr(0, offset+1);
-
-  return newPath;
-}
-
-/* Split string in a token array */
-std::vector<std::string> split(const std::string & str,
-                                      const std::string & delims=" *-")
-{
-  std::string::size_type lastPos = str.find_first_not_of(delims, 0);
-  std::string::size_type pos = str.find_first_of(delims, lastPos);
-  std::vector<std::string> tokens;
-
-  while (std::string::npos != pos || std::string::npos != lastPos)
-    {
-      tokens.push_back(str.substr(lastPos, pos - lastPos));
-      lastPos = str.find_first_not_of(delims, pos);
-      pos     = str.find_first_of(delims, lastPos);
-    }
-
-  return tokens;
-}
-
-std::vector<std::string> split(const char* lhs, const char* rhs){
-  const std::string m1 (lhs), m2 (rhs);
-  return split(m1, m2);
-}
-
-std::vector<std::string> split(const char* lhs, const std::string& rhs){
-  return split(lhs, rhs.c_str());
-}
-
-std::vector<std::string> split(const std::string& lhs, const char* rhs){
-  return split(lhs.c_str(), rhs);
-}
-
-/* Warning: the relative path must be with slashes */
-inline std::string computeAbsolutePath(const std::string path, const 
std::string relativePath) {
-
-  /* Add a trailing / to the path if necessary */
-  std::string absolutePath = path[path.length()-1] == '/' ? path : 
removeLastPathElement(path, false, false);
-
-  /* Go through relative path */
-  std::vector<std::string> relativePathElements;
-  std::stringstream relativePathStream(relativePath);
-  std::string relativePathItem;
-  while (std::getline(relativePathStream, relativePathItem, '/')) {
-    if (relativePathItem == "..") {
-      absolutePath = removeLastPathElement(absolutePath, true, false); 
-    } else if (!relativePathItem.empty() && relativePathItem != ".") {
-      absolutePath += relativePathItem;
-      absolutePath += "/";
-    }
-  }
-  
-  /* Remove wront trailing / */
-  return absolutePath.substr(0, absolutePath.length()-1);
-}
-
-/* Warning: the relative path must be with slashes */
-std::string computeRelativePath(const std::string path, const std::string 
absolutePath) {
-  std::vector<std::string> pathParts = split(path, "/");
-  std::vector<std::string> absolutePathParts = split(absolutePath, "/");
-
-  unsigned int commonCount = 0;
-  while (commonCount < pathParts.size() && 
-        commonCount < absolutePathParts.size() && 
-        pathParts[commonCount] == absolutePathParts[commonCount]) {
-    if (!pathParts[commonCount].empty()) {
-      commonCount++;
-    }
-  }
-    
-  std::string relativePath;
-  for (unsigned int i = commonCount ; i < pathParts.size()-1 ; i++) {
-    relativePath += "../";
-  }
-
-  for (unsigned int i = commonCount ; i < absolutePathParts.size() ; i++) {
-    relativePath += absolutePathParts[i];
-    relativePath += i + 1 < absolutePathParts.size() ? "/" : "";
-  }
-
-  return relativePath;
-}
 
 void directoryVisitorRunning(bool value) {
   pthread_mutex_lock(&directoryVisitorRunningMutex);
@@ -464,174 +200,6 @@
   }
 };
 
-static bool isLocalUrl(const std::string url) {
-  if (url.find(":") != std::string::npos) {
-    return (!(
-             url.find("://") != std::string::npos ||
-             url.find("//") == 0 ||
-             url.find("tel:") == 0 ||
-             url.find("geo:") == 0
-             ));
-  }
-  return true;
-}
-
-static std::string extractRedirectUrlFromHtml(const GumboVector* 
head_children) {
-  std::string url;
-  
-  for (int i = 0; i < head_children->length; ++i) {
-    GumboNode* child = (GumboNode*)(head_children->data[i]);
-    if (child->type == GUMBO_NODE_ELEMENT &&
-       child->v.element.tag == GUMBO_TAG_META) {
-      GumboAttribute* attribute;
-      if (attribute = gumbo_get_attribute(&child->v.element.attributes, 
"http-equiv")) {
-       if (!strcmp(attribute->value, "refresh")) {
-         if (attribute = gumbo_get_attribute(&child->v.element.attributes, 
"content")) {
-           std::string targetUrl = attribute->value;
-           std::size_t found = targetUrl.find("URL=") != std::string::npos ? 
targetUrl.find("URL=") : targetUrl.find("url=");
-           if (found!=std::string::npos) {
-             url = targetUrl.substr(found+4);
-           } else {
-             throw std::string("Unable to find the redirect/refresh target url 
from the HTML DOM");
-           }
-         }
-       }
-      }
-    }
-  }
-  
-  return url;
-}
-
-static void getLinks(GumboNode* node, std::map<std::string, bool> &links) {
-  if (node->type != GUMBO_NODE_ELEMENT) {
-    return;
-  }
-
-  GumboAttribute* attribute = NULL;
-  attribute = gumbo_get_attribute(&node->v.element.attributes, "href");
-  if (attribute == NULL) {
-    attribute = gumbo_get_attribute(&node->v.element.attributes, "src");
-  }
-
-  if (attribute != NULL && isLocalUrl(attribute->value)) {
-    links[attribute->value] = true;
-  }
-
-  GumboVector* children = &node->v.element.children;
-  for (int i = 0; i < children->length; ++i) {
-    getLinks(static_cast<GumboNode*>(children->data[i]), links);
-  }
-}
-
-inline static void replaceStringInPlaceOnce(std::string& subject,
-                                           const std::string& search,
-                                           const std::string& replace) {
-  size_t pos = 0;
-  while ((pos = subject.find(search, pos)) != std::string::npos) {
-    subject.replace(pos, search.length(), replace);
-    pos += replace.length();
-    return; /* Do it once */
-  }
-}
-
-inline static void replaceStringInPlace(std::string& subject, const 
std::string& search,
-                                const std::string& replace) {
-  size_t pos = 0;
-  while ((pos = subject.find(search, pos)) != std::string::npos) {
-    subject.replace(pos, search.length(), replace);
-    pos += replace.length();
-  }
-
-  return;
-}
-
-inline static void stripTitleInvalidChars(std::string & str) {
-
-  /* Remove unicode orientation invisible characters */
-  replaceStringInPlace(str, "\u202A", "");
-  replaceStringInPlace(str, "\u202C", "");
-}
-
-static std::string getMimeTypeForFile(const std::string& filename) {
-  std::string mimeType;
-
-  /* Try to get the mimeType from the file extension */
-  if (filename.find_last_of(".") != std::string::npos) {
-    mimeType = filename.substr(filename.find_last_of(".")+1);
-    if (extMimeTypes.find(mimeType) != extMimeTypes.end()) {
-      return extMimeTypes[mimeType];
-    }
-  }
-
-  /* Try to get the mimeType from the cache */
-  if (fileMimeTypes.find(filename) != fileMimeTypes.end()) {
-    return fileMimeTypes[filename];
-  }
-
-  /* Try to get the mimeType with libmagic */
-  try {
-    std::string path = directoryPath + "/" + filename;
-    mimeType = std::string(magic_file(magic, path.c_str()));
-    if (mimeType.find(";") != std::string::npos) {
-      mimeType = mimeType.substr(0, mimeType.find(";"));
-    }
-    fileMimeTypes[filename] = mimeType;
-    return mimeType;
-  } catch (...) {
-    return "";
-  }
-}
-
-inline std::string getNamespaceForMimeType(const std::string& mimeType) {
-  if (uniqueNamespace || mimeType.find("text") == 0 || mimeType.empty()) {
-    if (uniqueNamespace || mimeType.find("text/html") == 0 || 
mimeType.empty()) {
-      return "A";
-    } else {
-      return "-";
-    }
-  } else {
-    if (mimeType == "application/font-ttf" || 
-       mimeType == "application/font-woff" ||
-       mimeType == "application/vnd.ms-opentype" ||
-       mimeType == "application/vnd.ms-fontobject" ||
-       mimeType == "application/javascript" ||
-       mimeType == "application/json"
-       ) {
-      return "-";
-    } else {
-      return "I";
-    }
-  }
-}
-
-inline std::string removeLocalTagAndParameters(const std::string &url) {
-  std::string retVal = url;
-  std::size_t found;
-
-  /* Remove URL arguments */
-  found = retVal.find("?");
-  if (found != std::string::npos) {
-    retVal = retVal.substr(0, found);
-  }
-
-  /* Remove local tag */
-  found = retVal.find("#");
-  if (found != std::string::npos) {
-    retVal = retVal.substr(0, found);
-  }
-
-  return retVal;
-}
-
-inline std::string computeNewUrl(const std::string &aid, const std::string 
&url) {
-  std::string filename = computeAbsolutePath(aid, url);
-  std::string targetMimeType = 
getMimeTypeForFile(removeLocalTagAndParameters(decodeUrl(filename)));
-  std::string originMimeType = getMimeTypeForFile(aid);
-  std::string newUrl = "/" + getNamespaceForMimeType(targetMimeType) + "/" + 
filename;
-  std::string baseUrl = "/" + getNamespaceForMimeType(originMimeType) + "/" + 
aid;
-  return computeRelativePath(baseUrl, newUrl);
-}
 
 Article::Article(const std::string& path, const bool detectRedirects = true) {
   invalid = false;
@@ -678,7 +246,6 @@
              GumboNode* title_text = 
(GumboNode*)(child->v.element.children.data[0]);
              if (title_text->type == GUMBO_NODE_TEXT) {
                title = title_text->v.text.text;
-               stripTitleInvalidChars(title);
              }
            }
          }
@@ -1091,51 +658,6 @@
   ArticleSource source;
   int minChunkSize = 2048;
 
-  /* Init file extensions hash */
-  extMimeTypes["HTML"] = "text/html";
-  extMimeTypes["html"] = "text/html";
-  extMimeTypes["HTM"] = "text/html";
-  extMimeTypes["htm"] = "text/html";
-  extMimeTypes["PNG"] = "image/png";
-  extMimeTypes["png"] = "image/png";
-  extMimeTypes["TIFF"] = "image/tiff";
-  extMimeTypes["tiff"] = "image/tiff";
-  extMimeTypes["TIF"] = "image/tiff";
-  extMimeTypes["tif"] = "image/tiff";
-  extMimeTypes["JPEG"] = "image/jpeg";
-  extMimeTypes["jpeg"] = "image/jpeg";
-  extMimeTypes["JPG"] = "image/jpeg";
-  extMimeTypes["jpg"] = "image/jpeg";
-  extMimeTypes["GIF"] = "image/gif";
-  extMimeTypes["gif"] = "image/gif";
-  extMimeTypes["SVG"] = "image/svg+xml";
-  extMimeTypes["svg"] = "image/svg+xml";
-  extMimeTypes["TXT"] = "text/plain";
-  extMimeTypes["txt"] = "text/plain";
-  extMimeTypes["XML"] = "text/xml";
-  extMimeTypes["xml"] = "text/xml";
-  extMimeTypes["EPUB"] = "application/epub+zip";
-  extMimeTypes["epub"] = "application/epub+zip";
-  extMimeTypes["PDF"] = "application/pdf";
-  extMimeTypes["pdf"] = "application/pdf";
-  extMimeTypes["OGG"] = "application/ogg";
-  extMimeTypes["ogg"] = "application/ogg";
-  extMimeTypes["JS"] = "application/javascript";
-  extMimeTypes["js"] = "application/javascript";
-  extMimeTypes["JSON"] = "application/json";
-  extMimeTypes["json"] = "application/json";
-  extMimeTypes["CSS"] = "text/css";
-  extMimeTypes["css"] = "text/css";
-  extMimeTypes["otf"] = "application/vnd.ms-opentype";
-  extMimeTypes["OTF"] = "application/vnd.ms-opentype";
-  extMimeTypes["eot"] = "application/vnd.ms-fontobject";
-  extMimeTypes["EOT"] = "application/vnd.ms-fontobject";
-  extMimeTypes["ttf"] = "application/font-ttf";
-  extMimeTypes["TTF"] = "application/font-ttf";
-  extMimeTypes["woff"] = "application/font-woff";
-  extMimeTypes["WOFF"] = "application/font-woff";
-  extMimeTypes["vtt"] = "text/vtt";
-  extMimeTypes["VTT"] = "text/vtt";
 
   /* Argument parsing */
   static struct option long_options[] = {

-- 
To view, visit https://gerrit.wikimedia.org/r/295515
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia26754d14f0cd6c557626675beb5a7c5fe2cadaa
Gerrit-PatchSet: 1
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr>
Gerrit-Reviewer: Kelson <kel...@kiwix.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to