CMakeLists.txt | 2 goo/gbase64.cc | 50 ++++++++++++++++++++++ goo/gbase64.h | 28 ++++++++++++ goo/gbasename.cc | 51 ++++++++++++++++++++++ goo/gbasename.h | 22 +++++++++ utils/CMakeLists.txt | 1 utils/HtmlOutputDev.cc | 110 +++++++++++++++++++++---------------------------- utils/HtmlOutputDev.h | 22 ++++----- utils/InMemoryFile.cc | 75 +++++++++++++++++++++++++++++++++ utils/InMemoryFile.h | 51 ++++++++++++++++++++++ utils/pdfsig.cc | 6 -- utils/pdftohtml.1 | 3 + utils/pdftohtml.cc | 49 +++++++++++++++------ 13 files changed, 377 insertions(+), 93 deletions(-)
New commits: commit 5f6ff67b0e1dc075d737fc840642c292329dcd08 Author: Albert Astals Cid <[email protected]> Date: Sun Feb 10 15:32:26 2019 +0100 pdftohtml: Add -dataurls to man page diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1 index 5d711ba9..ea386ffd 100644 --- a/utils/pdftohtml.1 +++ b/utils/pdftohtml.1 @@ -43,6 +43,9 @@ generate complex output .B \-s generate single HTML that includes all pages .TP +.B \-dataurls +use data URLs instead of external images in HTML. Not available on all platforms +.TP .B \-i ignore images .TP commit 7b8dbc0a4dc8e0738658b8e4fe7c44adad15af24 Author: Greg Knight <[email protected]> Date: Fri Nov 23 22:30:12 2018 -0500 pdftohtml: singleHtml and stout are not mutually exclusive. Combining them with -dataurls is actually quite reasonable. diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 864ece99..6218ff3c 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -318,7 +318,7 @@ int main(int argc, char *argv[]) { if (scale>3.0) scale=3.0; if (scale<0.5) scale=0.5; - if (complexMode || singleHtml) { + if (complexMode) { //noframes=false; stout=false; } @@ -326,7 +326,6 @@ int main(int argc, char *argv[]) { if (stout) { noframes=true; complexMode=false; - singleHtml=false; } if (xml) commit 91ab53fa635e9ea964f10e9a6681d04d7185c732 Author: Greg Knight <[email protected]> Date: Fri Nov 23 19:53:38 2018 -0500 pdftohtml: add support for dataUrls argument eliminate the 'extension' field used to regenerate background images; replace with a list of background images diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 69a93724..1b078003 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -60,6 +60,8 @@ #include "goo/GooString.h" #include "goo/gbasename.h" #include "goo/GooList.h" +#include "goo/gbase64.h" +#include "goo/gbasename.h" #include "UnicodeMap.h" #include "goo/gmem.h" #include "Error.h" @@ -71,6 +73,7 @@ #include "HtmlOutputDev.h" #include "HtmlFonts.h" #include "HtmlUtils.h" +#include "InMemoryFile.h" 
#include "Outline.h" #include "PDFDoc.h" @@ -102,6 +105,7 @@ static inline bool IS_CLOSER(float x, float y, float z) { return fabs((x)-(y)) < extern bool complexMode; extern bool singleHtml; +extern bool dataUrls; extern bool ignore; extern bool printCommands; extern bool printHtml; @@ -267,7 +271,7 @@ void HtmlString::endString() // HtmlPage //------------------------------------------------------------------------ -HtmlPage::HtmlPage(bool rawOrder, const char *imgExtVal) { +HtmlPage::HtmlPage(bool rawOrder) { this->rawOrder = rawOrder; curStr = nullptr; yxStrings = nullptr; @@ -281,7 +285,6 @@ HtmlPage::HtmlPage(bool rawOrder, const char *imgExtVal) { fontsPageMarker = 0; DocName=nullptr; firstPage = -1; - imgExt = new GooString(imgExtVal); } HtmlPage::~HtmlPage() { @@ -289,7 +292,6 @@ HtmlPage::~HtmlPage() { delete DocName; delete fonts; delete links; - delete imgExt; deleteGooList<HtmlImage>(imgList); } @@ -849,14 +851,12 @@ int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page) return 0; } -void HtmlPage::dumpComplex(FILE *file, int page){ +void HtmlPage::dumpComplex(FILE *file, int page, const std::vector<std::string>& backgroundImages) { FILE* pageFile; if( firstPage == -1 ) firstPage = page; if (dumpComplexHeaders(file, pageFile, page)) { error(errIO, -1, "Couldn't write headers."); return; } - - const std::string str = gbasename(DocName->c_str()); fputs("<style type=\"text/css\">\n<!--\n",pageFile); fputs("\tp {margin: 0; padding: 0;}",pageFile); @@ -880,12 +880,11 @@ void HtmlPage::dumpComplex(FILE *file, int page){ fprintf(pageFile,"<div id=\"page%d-div\" style=\"position:relative;width:%dpx;height:%dpx;\">\n", page, pageWidth, pageHeight); - if( !ignore ) + if(!ignore && (size_t) (page - firstPage) < backgroundImages.size()) { fprintf(pageFile, - "<img width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n", - pageWidth, pageHeight, str.c_str(), - (page-firstPage+1), imgExt->c_str()); + "<img width=\"%d\" 
height=\"%d\" src=\"%s\" alt=\"background image\"/>\n", + pageWidth, pageHeight, backgroundImages[page - firstPage].c_str()); } for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){ @@ -915,12 +914,12 @@ void HtmlPage::dumpComplex(FILE *file, int page){ } -void HtmlPage::dump(FILE *f, int pageNum) +void HtmlPage::dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages) { if (complexMode || singleHtml) { if (xml) dumpAsXML(f, pageNum); - if (!xml) dumpComplex(f, pageNum); + if (!xml) dumpComplex(f, pageNum, backgroundImages); } else { @@ -1083,7 +1082,6 @@ void HtmlOutputDev::doFrame(int firstPage){ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title, const char *author, const char *keywords, const char *subject, const char *date, - const char *extension, bool rawOrder, int firstPage, bool outline) { catalog = catalogA; @@ -1099,7 +1097,7 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char //pageNum=firstPage; // open file needClose = false; - pages = new HtmlPage(rawOrder, extension); + pages = new HtmlPage(rawOrder); glMetaVars = new GooList(); glMetaVars->push_back(new HtmlMetaVar("generator", "pdftohtml 0.36")); @@ -1107,7 +1105,7 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char if( keywords ) glMetaVars->push_back(new HtmlMetaVar("keywords", keywords)); if( date ) glMetaVars->push_back(new HtmlMetaVar("date", date)); if( subject ) glMetaVars->push_back(new HtmlMetaVar("subject", subject)); - + maxPageWidth = 0; maxPageHeight = 0; @@ -1272,7 +1270,7 @@ void HtmlOutputDev::endPage() { pages->conv(); pages->coalesce(); - pages->dump(page, pageNum); + pages->dump(page, pageNum, backgroundImages); // I don't yet know what to do in the case when there are pages of different // sizes and we want complex output: running ghostscript many times @@ -1284,6 +1282,10 @@ void HtmlOutputDev::endPage() { if(!stout && !globalParams->getErrQuiet()) 
printf("Page-%d\n",(pageNum)); } +void HtmlOutputDev::addBackgroundImage(const std::string& img) { + backgroundImages.push_back(img); +} + void HtmlOutputDev::updateFont(GfxState *state) { pages->updateFont(state); } @@ -1309,12 +1311,14 @@ void HtmlOutputDev::drawChar(GfxState *state, double x, double y, void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str) { - FILE *f1; + InMemoryFile ims; + FILE *f1 = nullptr; int c; // open the image file - GooString *fName=createImageFileName("jpg"); - if (!(f1 = fopen(fName->c_str(), "wb"))) { + GooString *fName = createImageFileName("jpg"); + f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb"); + if (!f1) { error(errIO, -1, "Couldn't open image file '{0:t}'", fName); delete fName; return; @@ -1330,9 +1334,11 @@ void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str) fclose(f1); - if (fName) { - pages->addImage(fName, state); + if (dataUrls) { + delete fName; + fName = new GooString(std::string("data:image/jpeg;base64,") + gbase64Encode(ims.getBuffer())); } + pages->addImage(fName, state); } void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int height, @@ -1340,6 +1346,7 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he { #ifdef ENABLE_LIBPNG FILE *f1; + InMemoryFile ims; if (!colorMap && !isMask) { error(errInternal, -1, "Can't have color image without a color map"); @@ -1348,7 +1355,8 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he // open the image file GooString *fName=createImageFileName("png"); - if (!(f1 = fopen(fName->c_str(), "wb"))) { + f1 = dataUrls ? 
ims.open("wb") : fopen(fName->c_str(), "wb"); + if (!f1) { error(errIO, -1, "Couldn't open image file '{0:t}'", fName); delete fName; return; @@ -1453,6 +1461,10 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he delete writer; fclose(f1); + if (dataUrls) { + delete fName; + fName = new GooString(std::string("data:image/png;base64,") + gbase64Encode(ims.getBuffer())); + } pages->addImage(fName, state); #else return; @@ -1461,16 +1473,7 @@ void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int he GooString *HtmlOutputDev::createImageFileName(const char *ext) { - GooString *fName=new GooString(Docname); - fName->append("-"); - GooString *pgNum= GooString::fromInt(pageNum); - GooString *imgnum= GooString::fromInt(pages->getNumImages()+1); - - fName->append(pgNum)->append("_")->append(imgnum)->append(".")->append(ext); - delete pgNum; - delete imgnum; - - return fName; + return GooString::format("{0:s}-{1:d}_{2:d}.{3:s}", Docname->c_str(), pageNum, pages->getNumImages() + 1, ext); } void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str, diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h index a6866295..7f09c056 100644 --- a/utils/HtmlOutputDev.h +++ b/utils/HtmlOutputDev.h @@ -36,6 +36,7 @@ #include <stdio.h> #include "goo/GooList.h" +#include "goo/gbasename.h" #include "GfxFont.h" #include "OutputDev.h" #include "HtmlLinks.h" @@ -63,7 +64,6 @@ enum UnicodeTextDirection { textDirTopBottom }; - class HtmlString { public: @@ -116,7 +116,7 @@ class HtmlPage { public: // Constructor. - HtmlPage(bool rawOrder, const char *imgExtVal); + HtmlPage(bool rawOrder); // Destructor. ~HtmlPage(); @@ -159,7 +159,7 @@ public: // number of images on the current page int getNumImages() { return imgList->getLength(); } - void dump(FILE *f, int pageNum); + void dump(FILE *f, int pageNum, const std::vector<std::string>& backgroundImages); // Clear the page. 
void clear(); @@ -179,7 +179,7 @@ private: void setDocName(const char* fname); void dumpAsXML(FILE* f,int page); - void dumpComplex(FILE* f, int page); + void dumpComplex(FILE* f, int page, const std::vector<std::string>& backgroundImages); int dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page); // marks the position of the fonts that belong to current page (for noframes) @@ -189,7 +189,6 @@ private: GooList *imgList; GooString *DocName; - GooString *imgExt; int pageWidth; int pageHeight; int firstPage; // used to begin the numeration of pages @@ -234,7 +233,6 @@ public: const char *keywords, const char *subject, const char *date, - const char *extension, bool rawOrder, int firstPage = 1, bool outline = 0); @@ -283,6 +281,10 @@ public: // End a page. void endPage() override; + // add a background image to the list of background images, + // as this seems to be done outside other processing. The string is copied, so the caller keeps ownership of img. + void addBackgroundImage(const std::string& img); + //----- update text state void updateFont(GfxState *state) override; @@ -345,6 +347,7 @@ private: GooList *glMetaVars; Catalog *catalog; Page *docPage; + std::vector<std::string> backgroundImages; friend class HtmlPage; }; diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index bbd98237..864ece99 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -46,6 +46,8 @@ #include <time.h> #include "parseargs.h" #include "goo/GooString.h" +#include "goo/gbase64.h" +#include "goo/gbasename.h" #include "goo/gmem.h" #include "Object.h" #include "Stream.h" @@ -68,6 +70,7 @@ #include "DateInfo.h" #include "goo/gfile.h" #include "Win32Console.h" +#include "InMemoryFile.h" static int firstPage = 1; static int lastPage = 0; @@ -77,6 +80,7 @@ static bool printHelp = false; bool printHtml = false; bool complexMode=false; bool singleHtml=false; // singleHtml +bool dataUrls = false; bool ignore=false; static char extension[5]="png"; static double scale=1.5; @@ -123,6 +127,10 @@ static const ArgDesc 
argDesc[] = { "generate complex document"}, {"-s", argFlag, &singleHtml, 0, "generate single document that includes all pages"}, +#ifdef HAVE_IN_MEMORY_FILE + {"-dataurls", argFlag, &dataUrls, 0, + "use data URLs instead of external images in HTML"}, +#endif {"-i", argFlag, &ignore, 0, "ignore images"}, {"-noframes", argFlag, &noframes, 0, @@ -366,7 +374,6 @@ int main(int argc, char *argv[]) { keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr, - extension, rawOrder, firstPage, doOutline); @@ -387,13 +394,6 @@ int main(int argc, char *argv[]) { { delete date; } - - if (htmlOut->isOk()) - { - doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, - true, false, false); - htmlOut->dumpDocOutline(doc); - } if ((complexMode || singleHtml) && !xml && !ignore) { #ifdef HAVE_SPLASH @@ -409,6 +409,7 @@ int main(int argc, char *argv[]) { splashOut->startDoc(doc); for (int pg = firstPage; pg <= lastPage; ++pg) { + InMemoryFile imf; doc->displayPage(splashOut, pg, 72 * scale, 72 * scale, 0, true, false, false); @@ -416,10 +417,22 @@ int main(int argc, char *argv[]) { imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension); - - bitmap->writeImgFile(format, imgFileName->c_str(), - 72 * scale, 72 * scale); - + auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName->c_str(), "wb"); + if (!f1) { + fprintf(stderr, "Could not open %s\n", imgFileName->c_str()); + delete imgFileName; + continue; + } + bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale); + fclose(f1); + if (dataUrls) { + htmlOut->addBackgroundImage( + std::string((format == splashFormatJpeg) ? 
"data:image/jpeg;base64," : "data:image/png;base64,") + + gbase64Encode(imf.getBuffer()) + ); + } else { + htmlOut->addBackgroundImage(gbasename(imgFileName->c_str())); + } delete imgFileName; } @@ -434,7 +447,14 @@ int main(int argc, char *argv[]) { return -1; #endif } - + + if (htmlOut->isOk()) + { + doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, + true, false, false); + htmlOut->dumpDocOutline(doc); + } + delete htmlOut; exit_status = EXIT_SUCCESS; commit 44da4d785cffeb5d4bbb1460479add6ce01edea2 Author: Greg Knight <[email protected]> Date: Sun Feb 10 10:31:36 2019 +0100 Introduce gbase64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 74294ca8..bf187ab4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -314,6 +314,7 @@ configure_file(config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) configure_file(poppler/poppler-config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h) set(poppler_SRCS + goo/gbase64.cc goo/gbasename.cc goo/gfile.cc goo/GooTimer.cc diff --git a/goo/gbase64.cc b/goo/gbase64.cc new file mode 100644 index 00000000..e0da77ff --- /dev/null +++ b/goo/gbase64.cc @@ -0,0 +1,50 @@ +//======================================================================== +// +// gbase64.cc +// +// Implementation of a base64 encoder, because another one did not immediately +// avail itself. 
+// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2018 Greg Knight <[email protected]> +// +//======================================================================== + +#include "gbase64.h" +#include <sstream> + +static void b64encodeTriplet(char output[4], unsigned char a, unsigned char b, unsigned char c) +{ + static const char* base64table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + output[0] = base64table[((a >> 2) & 0x3f) ]; // upper 6 of first byte + output[1] = base64table[((a << 4) & 0x30) | ((b >> 4) & 0x0f)]; // lower 2 of first byte, upper 4 of second byte + output[2] = base64table[((b << 2) & 0x3c) | ((c >> 6) & 0x03)]; // lower 4 of second byte, upper 2 of third byte + output[3] = base64table[((c ) & 0x3f)]; // lower 6 of third byte +} + +std::string gbase64Encode(const void* input, size_t len) +{ + char quad[4]; + size_t pos = 0; + std::stringstream buf; + auto bytes = static_cast<const unsigned char*>(input); + for ( ; pos + 3 <= len; pos += 3) { + b64encodeTriplet(quad, bytes[0], bytes[1], bytes[2]); + buf.write(&quad[0], 4); + bytes += 3; + } + switch (len - pos) { + case 1: + b64encodeTriplet(quad, bytes[0], 0, 0); + quad[2] = quad[3] = '='; + buf.write(&quad[0], 4); + break; + case 2: + b64encodeTriplet(quad, bytes[0], bytes[1], 0); + quad[3] = '='; + buf.write(&quad[0], 4); + break; + } + return buf.str(); +} diff --git a/goo/gbase64.h b/goo/gbase64.h new file mode 100644 index 00000000..06e2e8b6 --- /dev/null +++ b/goo/gbase64.h @@ -0,0 +1,28 @@ +//======================================================================== +// +// gbase64.h +// +// Implementation of a base64 encoder, because another one did not immediately +// avail itself. 
+// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2018 Greg Knight <[email protected]> +// +//======================================================================== + +#ifndef GOO_GBASE64_H +#define GOO_GBASE64_H + +#include <string> +#include <vector> + +std::string gbase64Encode(const void* input, size_t sz); + +inline std::string gbase64Encode(const std::vector<char>& input) + { return input.empty() ? std::string() : gbase64Encode(&input[0], input.size()); } + +inline std::string gbase64Encode(const std::vector<unsigned char>& input) + { return input.empty() ? std::string() : gbase64Encode(&input[0], input.size()); } + +#endif // ndef GOO_GBASE64_H commit 2ba81611e9ccdcb49275ee247308bd0dcba3e64d Author: Greg Knight <[email protected]> Date: Sun Feb 10 10:28:26 2019 +0100 Introduce gbasename diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d49721c..74294ca8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -314,6 +314,7 @@ configure_file(config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) configure_file(poppler/poppler-config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h) set(poppler_SRCS + goo/gbasename.cc goo/gfile.cc goo/GooTimer.cc goo/GooString.cc diff --git a/goo/gbasename.cc b/goo/gbasename.cc new file mode 100644 index 00000000..dd4607b4 --- /dev/null +++ b/goo/gbasename.cc @@ -0,0 +1,51 @@ +//======================================================================== +// +// gbasename.cc +// +// Wrapper for libgen's basename() call which returns a std::string. +// This is a convenience method working around questionable behavior +// in the copy of basename() provided by libgen.h. +// +// According to man 3 basename: +// +// Both dirname() and basename() may modify the contents of path, so it +// may be desirable to pass a copy when calling one of these functions. +// +// ... +// +// These functions may return pointers to statically allocated memory +// which may be overwritten by subsequent calls. 
Alternatively, they +// may return a pointer to some part of path, so that the string +// referred to by path should not be modified or freed until the pointer +// returned by the function is no longer required. +// +// Because basename can modify filename (for some reason), we have to +// duplicate our input into a mutable buffer before we can call it. +// The return value might be part of this mutable temporary, but not +// generally the front, so 'char *' cannot be used as our return value. +// The return value might also be a statically allocated string, +// rendering basename (and thus gbasename) non-thread-safe. Because +// we don't know how basename()'s return value is lifecycled, we need +// to duplicate it again into something whose lifecycle we can predict. +// +// This is how a method that should amount to finding the last slash +// in a string ends up requiring two memory allocations while managing +// not to be thread-safe. In a way, it's kind of impressive. +// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2018 Greg Knight <[email protected]> +// +//======================================================================== + +#include "gbasename.h" +#include <libgen.h> +#include <string.h> + +std::string gbasename(const char* filename) +{ + char* mutabl = strdup(filename); + std::string retu = basename(mutabl); + free(mutabl); + return retu; +} diff --git a/goo/gbasename.h b/goo/gbasename.h new file mode 100644 index 00000000..3c5e0065 --- /dev/null +++ b/goo/gbasename.h @@ -0,0 +1,22 @@ +//======================================================================== +// +// gbasename.h +// +// Wrapper for libgen's basename() call which returns a std::string. +// This is a convenience method working around questionable behavior +// in the copy of basename() provided by libgen.h. 
+// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2018 Greg Knight <[email protected]> +// +//======================================================================== + +#ifndef GBASENAME_H +#define GBASENAME_H + +#include <string> + +std::string gbasename(const char* input); + +#endif // ndef GBASENAME_H diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index d83319e2..69a93724 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -58,6 +58,7 @@ #include <math.h> #include <iostream> #include "goo/GooString.h" +#include "goo/gbasename.h" #include "goo/GooList.h" #include "UnicodeMap.h" #include "goo/gmem.h" @@ -116,16 +117,6 @@ extern double wordBreakThreshold; static bool debug = false; static GooString *gstr_buff0 = nullptr; // a workspace in which I format strings -static GooString* basename(GooString* str){ - - const char *p=str->c_str(); - int len=str->getLength(); - for (int i=len-1;i>=0;i--) - if (*(p+i)==SLASH) - return new GooString((p+i+1),len-i-1); - return new GooString(str); -} - #if 0 static GooString* Dirname(GooString* str){ @@ -860,13 +851,12 @@ int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page) void HtmlPage::dumpComplex(FILE *file, int page){ FILE* pageFile; - GooString* tmp; if( firstPage == -1 ) firstPage = page; if (dumpComplexHeaders(file, pageFile, page)) { error(errIO, -1, "Couldn't write headers."); return; } - tmp=basename(DocName); + const std::string str = gbasename(DocName->c_str()); fputs("<style type=\"text/css\">\n<!--\n",pageFile); fputs("\tp {margin: 0; padding: 0;}",pageFile); @@ -894,12 +884,10 @@ void HtmlPage::dumpComplex(FILE *file, int page){ { fprintf(pageFile, "<img width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n", - pageWidth, pageHeight, tmp->c_str(), + pageWidth, pageHeight, str.c_str(), (page-firstPage+1), imgExt->c_str()); } - delete tmp; - for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){ if (tmp1->htext){ 
fprintf(pageFile, @@ -1070,7 +1058,7 @@ void HtmlOutputDev::doFrame(int firstPage){ delete fName; - fName=basename(Docname); + const std::string baseName = gbasename(Docname->c_str()); fputs(DOCTYPE, fContentsFrame); fputs("\n<html>",fContentsFrame); fputs("\n<head>",fContentsFrame); @@ -1080,16 +1068,15 @@ void HtmlOutputDev::doFrame(int firstPage){ dumpMetaVars(fContentsFrame); fprintf(fContentsFrame, "</head>\n"); fputs("<frameset cols=\"100,*\">\n",fContentsFrame); - fprintf(fContentsFrame,"<frame name=\"links\" src=\"%s_ind.html\"/>\n",fName->c_str()); + fprintf(fContentsFrame,"<frame name=\"links\" src=\"%s_ind.html\"/>\n", baseName.c_str()); fputs("<frame name=\"contents\" src=",fContentsFrame); if (complexMode) - fprintf(fContentsFrame,"\"%s-%d.html\"",fName->c_str(), firstPage); + fprintf(fContentsFrame,"\"%s-%d.html\"", baseName.c_str(), firstPage); else - fprintf(fContentsFrame,"\"%ss.html\"",fName->c_str()); + fprintf(fContentsFrame,"\"%ss.html\"", baseName.c_str()); fputs("/>\n</frameset>\n</html>\n",fContentsFrame); - delete fName; delete htmlEncoding; fclose(fContentsFrame); } @@ -1149,9 +1136,9 @@ HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char if (doOutline) { - GooString *str = basename(Docname); - fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>", str->c_str(), complexMode ? "-outline.html" : "s.html#outline"); - delete str; + fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>", + gbasename(Docname->c_str()).c_str(), + complexMode ? 
"-outline.html" : "s.html#outline"); } } if (!complexMode) @@ -1256,24 +1243,22 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) { #endif this->pageNum = pageNum; - GooString *str=basename(Docname); + const std::string str = gbasename(Docname->c_str()); pages->clear(); if(!noframes) { if (fContentsFrame) { if (complexMode) - fprintf(fContentsFrame,"<a href=\"%s-%d.html\"",str->c_str(),pageNum); + fprintf(fContentsFrame,"<a href=\"%s-%d.html\"", str.c_str(), pageNum); else - fprintf(fContentsFrame,"<a href=\"%ss.html#%d\"",str->c_str(),pageNum); + fprintf(fContentsFrame,"<a href=\"%ss.html#%d\"", str.c_str(), pageNum); fprintf(fContentsFrame," target=\"contents\" >Page %d</a><br/>\n",pageNum); } } pages->pageWidth=static_cast<int>(state->getPageWidth()); pages->pageHeight=static_cast<int>(state->getPageHeight()); - - delete str; } @@ -1561,8 +1546,8 @@ GooString* HtmlOutputDev::getLinkDest(AnnotLink *link){ switch(link->getAction()->getKind()) { case actionGoTo: - { - GooString* file=basename(Docname); + { + GooString* file = new GooString(gbasename(Docname->c_str())); int page=1; LinkGoTo *ha=(LinkGoTo *)link->getAction(); LinkDest *dest=nullptr; @@ -1781,7 +1766,7 @@ bool HtmlOutputDev::newHtmlOutlineLevel(FILE *output, const GooList *outlines, i frames file-4.html files.html#4 noframes file.html#4 file.html#4 */ - linkName=basename(Docname); + linkName = new GooString(gbasename(Docname->c_str())); GooString *str=GooString::fromInt(page); if (noframes) { linkName->append(".html#"); diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h index 30a7f1d3..a6866295 100644 --- a/utils/HtmlOutputDev.h +++ b/utils/HtmlOutputDev.h @@ -44,13 +44,6 @@ #include "Catalog.h" #include "UnicodeMap.h" - -#ifdef _WIN32 -# define SLASH '\\' -#else -# define SLASH '/' -#endif - #define xoutRound(x) ((int)(x + 0.5)) #define DOCTYPE "<!DOCTYPE html>" diff --git a/utils/pdfsig.cc b/utils/pdfsig.cc index 64299690..3a432b36 100644 --- a/utils/pdfsig.cc +++ 
b/utils/pdfsig.cc @@ -24,10 +24,10 @@ #include <time.h> #include <hasht.h> #include <fstream> -#include <libgen.h> #include "parseargs.h" #include "Object.h" #include "Array.h" +#include "goo/gbasename.h" #include "Page.h" #include "PDFDoc.h" #include "PDFDocFactory.h" @@ -108,9 +108,7 @@ static void dumpSignature(int sig_num, int sigCount, FormWidgetSignature *sig_wi // since { is the magic character to replace things we need to put it twice where // we don't want it to be replaced GooString *format = GooString::format("{{0:s}}.sig{{1:{0:d}d}}", sigCountLength); - char *filenameCopy = strdup(filename); - GooString *path = GooString::format(format->c_str(), basename(filenameCopy), sig_num); - free(filenameCopy); + GooString *path = GooString::format(format->c_str(), gbasename(filename).c_str(), sig_num); printf("Signature #%d (%u bytes) => %s\n", sig_num, signature->getLength(), path->c_str()); std::ofstream outfile(path->c_str(), std::ofstream::binary); outfile.write(signature->c_str(), signature->getLength()); commit 7f4da59665969f624c18a1ba3e1f1ac1ca3478b1 Author: Greg Knight <[email protected]> Date: Fri Nov 23 19:37:37 2018 -0500 pdftohtml data urls: adding InMemoryFile utility class diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 34d96475..3516479e 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -121,6 +121,7 @@ install(FILES pdftotext.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1) # pdftohtml set(pdftohtml_SOURCES ${common_srcs} + InMemoryFile.cc pdftohtml.cc HtmlFonts.cc HtmlLinks.cc diff --git a/utils/InMemoryFile.cc b/utils/InMemoryFile.cc new file mode 100644 index 00000000..d4ed0f48 --- /dev/null +++ b/utils/InMemoryFile.cc @@ -0,0 +1,75 @@ +//======================================================================== +// +// InMemoryFile.cc +// +// Represents a file in-memory with GNU's stdio wrappers. +// NOTE as of this writing, open() depends on the glibc 'fopencookie' +// extension and is not supported on other platforms. 
The +// HAVE_IN_MEMORY_FILE macro is intended to reflect whether this class is +// usable. +// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2018 Greg Knight <[email protected]> +// +//======================================================================== + +#include "InMemoryFile.h" + +#include <string.h> +#include <sstream> + +InMemoryFile::InMemoryFile() + : iohead(0) + , fptr(nullptr) +{ +} + +ssize_t InMemoryFile::_read(char* buf, size_t sz) +{ + auto toRead = std::min<size_t>(data.size() - iohead, sz); + memcpy(&buf[0], &data[iohead], toRead); + iohead += toRead; + return toRead; +} + +ssize_t InMemoryFile::_write(const char* buf, size_t sz) +{ + if (iohead + sz > data.size()) + data.resize(iohead + sz); + memcpy(&data[iohead], buf, sz); + iohead += sz; + return sz; +} + +int InMemoryFile::_seek(off64_t* offset, int whence) +{ + switch (whence) { + case SEEK_SET: iohead = (*offset); break; + case SEEK_CUR: iohead += (*offset); break; + case SEEK_END: iohead -= (*offset); break; + } + (*offset) = std::min<off64_t>(std::max<off64_t>(iohead, 0l), data.size()); + iohead = static_cast<size_t>(*offset); + return 0; +} + +FILE* InMemoryFile::open(const char* mode) +{ +#if HAVE_IN_MEMORY_FILE_FOPENCOOKIE + if (fptr != nullptr) { + fprintf(stderr, "InMemoryFile: BUG: Why is this opened more than once?"); + return nullptr; // maybe there's some legit reason for it, whoever comes up with one can remove this line + } + static cookie_io_functions_t methods = { + /* .read = */ [](void* self, char* buf, size_t sz) { return ((InMemoryFile*)self)->_read(buf, sz); }, + /* .write = */ [](void* self, const char* buf, size_t sz) { return ((InMemoryFile*)self)->_write(buf, sz); }, + /* .seek = */ [](void* self, off64_t* offset, int whence) { return ((InMemoryFile*)self)->_seek(offset, whence); }, + /* .close = */ [](void* self) { ((InMemoryFile*)self)->fptr = nullptr; return 0; }, + }; + return fptr = fopencookie(this, mode, methods); +#else + fprintf 
(stderr, "If you can read this, your platform does not support the features necessary to achieve your goals."); + return nullptr; +#endif +} diff --git a/utils/InMemoryFile.h b/utils/InMemoryFile.h new file mode 100644 index 00000000..6af7d503 --- /dev/null +++ b/utils/InMemoryFile.h @@ -0,0 +1,51 @@ +//======================================================================== +// +// InMemoryFile.h +// +// Represents a file in-memory with GNU's stdio wrappers. +// NOTE as of this writing, open() depends on the glibc 'fopencookie' +// extension and is not supported on other platforms. The +// HAVE_IN_MEMORY_FILE macro is intended to reflect whether this class is +// usable. +// +// This file is licensed under the GPLv2 or later +// +// Copyright (C) 2018 Greg Knight <[email protected]> +// +//======================================================================== + +#ifndef IN_MEMORY_FILE_H +#define IN_MEMORY_FILE_H + +#include <stdio.h> +#include <string> +#include <vector> + +#if defined(__USE_GNU) && !defined(__ANDROID_API__) +# define HAVE_IN_MEMORY_FILE (1) +# define HAVE_IN_MEMORY_FILE_FOPENCOOKIE (1) // used internally +#endif + +class InMemoryFile { +private: + size_t iohead; + std::vector<char> data; + FILE *fptr; + + ssize_t _read(char* buf, size_t sz); + ssize_t _write(const char* buf, size_t sz); + int _seek(off64_t* offset, int whence); + +public: + InMemoryFile(); + +public: + /* Returns a file handle for this file. This is scoped to this object + * and must be fclosed() by the caller before destruction. */ + FILE* open(const char* mode); + + const std::vector<char>& getBuffer() const + { return data; } +}; + +#endif // IN_MEMORY_FILE_H _______________________________________________ poppler mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/poppler
