Mgautierfr has uploaded a new change for review. https://gerrit.wikimedia.org/r/295516
Change subject: Move article's related stuffs in article.(h|cpp). ...................................................................... Move article's related stuffs in article.(h|cpp). Change-Id: I2a257ea1a0a13eca0748b444838a525666a9090d --- M zimwriterfs/Makefile.am A zimwriterfs/article.cpp A zimwriterfs/article.h M zimwriterfs/zimwriterfs.cpp 4 files changed, 253 insertions(+), 199 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/openzim refs/changes/16/295516/1 diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am index ea2ab7a..3383e35 100644 --- a/zimwriterfs/Makefile.am +++ b/zimwriterfs/Makefile.am @@ -3,4 +3,5 @@ zimwriterfs_SOURCES= \ zimwriterfs.cpp \ - tools.cpp + tools.cpp \ + article.cpp diff --git a/zimwriterfs/article.cpp b/zimwriterfs/article.cpp new file mode 100644 index 0000000..f743cde --- /dev/null +++ b/zimwriterfs/article.cpp @@ -0,0 +1,158 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org> + * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "article.h" +#include "tools.h" + + +extern std::string directoryPath; + +Article::Article(const std::string& path, const bool detectRedirects) { + invalid = false; + + /* aid */ + aid = path.substr(directoryPath.size()+1); + + /* url */ + url = aid; + + /* mime-type */ + mimeType = getMimeTypeForFile(aid); + + /* namespace */ + ns = getNamespaceForMimeType(mimeType)[0]; + + /* HTML specific code */ + if (mimeType.find("text/html") != std::string::npos) { + std::size_t found; + std::string html = getFileContent(path); + GumboOutput* output = gumbo_parse(html.c_str()); + GumboNode* root = output->root; + + /* Search the content of the <title> tag in the HTML */ + if (root->type == GUMBO_NODE_ELEMENT && root->v.element.children.length >= 2) { + const GumboVector* root_children = &root->v.element.children; + GumboNode* head = NULL; + for (int i = 0; i < root_children->length; ++i) { + GumboNode* child = (GumboNode*)(root_children->data[i]); + if (child->type == GUMBO_NODE_ELEMENT && + child->v.element.tag == GUMBO_TAG_HEAD) { + head = child; + break; + } + } + + if (head != NULL) { + GumboVector* head_children = &head->v.element.children; + for (int i = 0; i < head_children->length; ++i) { + GumboNode* child = (GumboNode*)(head_children->data[i]); + if (child->type == GUMBO_NODE_ELEMENT && + child->v.element.tag == GUMBO_TAG_TITLE) { + if (child->v.element.children.length == 1) { + GumboNode* title_text = (GumboNode*)(child->v.element.children.data[0]); + if (title_text->type == GUMBO_NODE_TEXT) { + title = title_text->v.text.text; + } + } + } + } + + /* Detect if this is a redirection (if no redirects CSV specified) */ + std::string targetUrl; + try { + targetUrl = detectRedirects ? extractRedirectUrlFromHtml(head_children) : ""; + } catch (std::string &error) { + std::cerr << error << std::endl; + } + if (!targetUrl.empty()) { + redirectAid = computeAbsolutePath(aid, decodeUrl(targetUrl)); + if (!fileExists(directoryPath + "/" + redirectAid)) { + redirectAid.clear(); + invalid = true; + } + } + } + + /* If no title, then compute one from the filename */ + if (title.empty()) { + found = path.rfind("/"); + if (found != std::string::npos) { + title = path.substr(found+1); + found = title.rfind("."); + if (found!=std::string::npos) { + title = title.substr(0, found); + } + } else { + title = path; + } + std::replace(title.begin(), title.end(), '_', ' '); + } + } + + gumbo_destroy_output(&kGumboDefaultOptions, output); + } +} + +std::string Article::getAid() const +{ + return aid; +} + +bool Article::isInvalid() const +{ + return invalid; +} + +char Article::getNamespace() const +{ + return ns; +} + +std::string Article::getUrl() const +{ + return url; +} + +std::string Article::getTitle() const +{ + return title; +} + +bool Article::isRedirect() const +{ + return !redirectAid.empty(); +} + +std::string Article::getMimeType() const +{ + return mimeType; +} + +std::string Article::getRedirectAid() const +{ + return redirectAid; +} + +bool Article::shouldCompress() const { + return (getMimeType().find("text") == 0 || + getMimeType() == "application/javascript" || + getMimeType() == "application/json" || + getMimeType() == "image/svg+xml" ? true : false); +} diff --git a/zimwriterfs/article.h b/zimwriterfs/article.h new file mode 100644 index 0000000..2585fc6 --- /dev/null +++ b/zimwriterfs/article.h @@ -0,0 +1,91 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org> + * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_ZIMWRITERFS_ARTICLE_H +#define OPENZIM_ZIMWRITERFS_ARTICLE_H + +#include <string> +#include <zim/writer/zimcreator.h> + +extern std::string favicon; + +class Article : public zim::writer::Article { + protected: + char ns; + bool invalid; + std::string aid; + std::string url; + std::string title; + std::string mimeType; + std::string redirectAid; + std::string data; + + public: + Article() { + invalid = false; + } + explicit Article(const std::string& id, const bool detectRedirects = true); + virtual std::string getAid() const; + virtual char getNamespace() const; + virtual std::string getUrl() const; + virtual bool isInvalid() const; + virtual std::string getTitle() const; + virtual bool isRedirect() const; + virtual std::string getMimeType() const; + virtual std::string getRedirectAid() const; + virtual bool shouldCompress() const; +}; + +class MetadataArticle : public Article { + public: + MetadataArticle(std::string &id) { + if (id == "Favicon") { + aid = "/-/" + id; + mimeType="image/png"; + redirectAid = favicon; + ns = '-'; + url = "favicon"; + } else { + aid = "/M/" + id; + mimeType="text/plain"; + ns = 'M'; + url = id; + } + } +}; + +class RedirectArticle : public Article { + public: + RedirectArticle(const std::string &line) { + size_t start; + size_t end; + ns = line[0]; + end = line.find_first_of("\t", 2); + url = line.substr(2, end - 2); + start = end + 1; + end = line.find_first_of("\t", start); + title = line.substr(start, end - start); + redirectAid = line.substr(end + 1); + aid = "/" + line.substr(0, 1) + "/" + url; + mimeType = "text/plain"; + } +}; + +#endif // OPENZIM_ZIMWRITERFS_ARTICLE_H diff --git a/zimwriterfs/zimwriterfs.cpp b/zimwriterfs/zimwriterfs.cpp index 693e6a1..93987b4 100644 --- a/zimwriterfs/zimwriterfs.cpp +++ b/zimwriterfs/zimwriterfs.cpp @@ -39,6 +39,7 @@ #include "tools.h" +#include "article.h" #define MAX_QUEUE_SIZE 100 @@ -135,203 +136,6 @@ } while (isDirectoryVisitorRunning() || !isFilenameQueueEmpty()); return retVal; -} - -/* Article class */ -class Article : public zim::writer::Article { - protected: - char ns; - bool invalid; - std::string aid; - std::string url; - std::string title; - std::string mimeType; - std::string redirectAid; - std::string data; - - public: - Article() { - invalid = false; - } - explicit Article(const std::string& id, const bool detectRedirects); - virtual std::string getAid() const; - virtual char getNamespace() const; - virtual std::string getUrl() const; - virtual bool isInvalid() const; - virtual std::string getTitle() const; - virtual bool isRedirect() const; - virtual std::string getMimeType() const; - virtual std::string getRedirectAid() const; - virtual bool shouldCompress() const; -}; - -class MetadataArticle : public Article { - public: - MetadataArticle(std::string &id) { - if (id == "Favicon") { - aid = "/-/" + id; - mimeType="image/png"; - redirectAid = favicon; - ns = '-'; - url = "favicon"; - } else { - aid = "/M/" + id; - mimeType="text/plain"; - ns = 'M'; - url = id; - } - } -}; - -class RedirectArticle : public Article { - public: - RedirectArticle(const std::string &line) { - size_t start; - size_t end; - ns = line[0]; - end = line.find_first_of("\t", 2); - url = line.substr(2, end - 2); - start = end + 1; - end = line.find_first_of("\t", start); - title = line.substr(start, end - start); - redirectAid = line.substr(end + 1); - aid = "/" + line.substr(0, 1) + "/" + url; - mimeType = "text/plain"; - } -}; - - -Article::Article(const std::string& path, const bool detectRedirects = true) { - invalid = false; - - /* aid */ - aid = path.substr(directoryPath.size()+1); - - /* url */ - url = aid; - - /* mime-type */ - mimeType = getMimeTypeForFile(aid); - - /* namespace */ - ns = getNamespaceForMimeType(mimeType)[0]; - - /* HTML specific code */ - if (mimeType.find("text/html") != std::string::npos) { - std::size_t found; - std::string html = getFileContent(path); - GumboOutput* output = gumbo_parse(html.c_str()); - GumboNode* root = output->root; - - /* Search the content of the <title> tag in the HTML */ - if (root->type == GUMBO_NODE_ELEMENT && root->v.element.children.length >= 2) { - const GumboVector* root_children = &root->v.element.children; - GumboNode* head = NULL; - for (int i = 0; i < root_children->length; ++i) { - GumboNode* child = (GumboNode*)(root_children->data[i]); - if (child->type == GUMBO_NODE_ELEMENT && - child->v.element.tag == GUMBO_TAG_HEAD) { - head = child; - break; - } - } - - if (head != NULL) { - GumboVector* head_children = &head->v.element.children; - for (int i = 0; i < head_children->length; ++i) { - GumboNode* child = (GumboNode*)(head_children->data[i]); - if (child->type == GUMBO_NODE_ELEMENT && - child->v.element.tag == GUMBO_TAG_TITLE) { - if (child->v.element.children.length == 1) { - GumboNode* title_text = (GumboNode*)(child->v.element.children.data[0]); - if (title_text->type == GUMBO_NODE_TEXT) { - title = title_text->v.text.text; - } - } - } - } - - /* Detect if this is a redirection (if no redirects CSV specified) */ - std::string targetUrl; - try { - targetUrl = detectRedirects ? extractRedirectUrlFromHtml(head_children) : ""; - } catch (std::string &error) { - std::cerr << error << std::endl; - } - if (!targetUrl.empty()) { - redirectAid = computeAbsolutePath(aid, decodeUrl(targetUrl)); - if (!fileExists(directoryPath + "/" + redirectAid)) { - redirectAid.clear(); - invalid = true; - } - } - } - - /* If no title, then compute one from the filename */ - if (title.empty()) { - found = path.rfind("/"); - if (found != std::string::npos) { - title = path.substr(found+1); - found = title.rfind("."); - if (found!=std::string::npos) { - title = title.substr(0, found); - } - } else { - title = path; - } - std::replace(title.begin(), title.end(), '_', ' '); - } - } - - gumbo_destroy_output(&kGumboDefaultOptions, output); - } -} - -std::string Article::getAid() const -{ - return aid; -} - -bool Article::isInvalid() const -{ - return invalid; -} - -char Article::getNamespace() const -{ - return ns; -} - -std::string Article::getUrl() const -{ - return url; -} - -std::string Article::getTitle() const -{ - return title; -} - -bool Article::isRedirect() const -{ - return !redirectAid.empty(); -} - -std::string Article::getMimeType() const -{ - return mimeType; -} - -std::string Article::getRedirectAid() const -{ - return redirectAid; -} - -bool Article::shouldCompress() const { - return (getMimeType().find("text") == 0 || - getMimeType() == "application/javascript" || - getMimeType() == "application/json" || - getMimeType() == "image/svg+xml" ? true : false); } /* ArticleSource class */ @@ -657,7 +461,7 @@ int main(int argc, char** argv) { ArticleSource source; int minChunkSize = 2048; - + /* Argument parsing */ static struct option long_options[] = { -- To view, visit https://gerrit.wikimedia.org/r/295516 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2a257ea1a0a13eca0748b444838a525666a9090d Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits