utils/HtmlFonts.h | 3 ++- utils/pdftohtml.cc | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-)
New commits: commit c5601bde9d8f3f56e558a6f63e563c9d337810eb Author: Steven Murdoch <[email protected]> Date: Mon Jun 20 23:25:43 2011 +0100 Fix encoding of PDF document metadata in output of pdftohtml pdftohtml simply copies the PDF document title into the <title> HTML tag, which fails when the title is UCS-2 encoded, or if it contains characters which are in pdfDocEncoding (an ISO 8859-1 superset), but not in ISO 8859-1. This patch fixes the problem by decoding UCS-2 or pdfDocEncoding into Unicode, then encoding this in the desired output encoding. HTML escaping wasn't being done either, so I have used the existing function HtmlFont::HtmlFilter to perform both HTML escaping and character set encoding. This static method had to be made public to call it from pdftohtml. See bug #37900. diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h index a0ca78a..2cdea4b 100644 --- a/utils/HtmlFonts.h +++ b/utils/HtmlFonts.h @@ -19,6 +19,7 @@ // // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey ([email protected]) and Onkar Potdar ([email protected]) // Copyright (C) 2010 Albert Astals Cid <[email protected]> +// Copyright (C) 2011 Steven Murdoch <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -65,7 +66,6 @@ class HtmlFont{ static GooString *DefaultFont; GooString *FontName; HtmlFontColor color; - static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); public: HtmlFont(){FontName=NULL;}; @@ -84,6 +84,7 @@ public: GooString* getFontName(); static GooString* getDefaultFont(); static void setDefaultFont(GooString* defaultFont); + static GooString* HtmlFilter(Unicode* u, int uLen); //char* s); GBool isEqual(const HtmlFont& x) const; GBool isEqualIgnoreBold(const HtmlFont& x) const; static GooString* simple(HtmlFont *font, Unicode *content, int uLen); diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index b46bf1b..fa00ae1 100644 --- 
a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -18,6 +18,7 @@ // Copyright (C) 2010 Mike Slegeir <[email protected]> // Copyright (C) 2010 Suzuki Toshiya <[email protected]> // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey ([email protected]) and Onkar Potdar ([email protected]) +// Copyright (C) 2011 Steven Murdoch <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -53,6 +54,7 @@ #endif #include "PSOutputDev.h" #include "GlobalParams.h" +#include "PDFDocEncoding.h" #include "Error.h" #include "DateInfo.h" #include "goo/gfile.h" @@ -511,13 +513,46 @@ int main(int argc, char *argv[]) { static GooString* getInfoString(Dict *infoDict, char *key) { Object obj; - GooString *s1 = NULL; + // Raw value as read from PDF (may be in pdfDocEncoding or UCS2) + GooString *rawString; + // Value converted to unicode + Unicode *unicodeString; + int unicodeLength; + // Value HTML escaped and converted to desired encoding + GooString *encodedString = NULL; + // Is rawString UCS2 (as opposed to pdfDocEncoding) + GBool isUnicode; if (infoDict->lookup(key, &obj)->isString()) { - s1 = new GooString(obj.getString()); + rawString = obj.getString(); + + // Convert rawString to unicode + encodedString = new GooString(); + if (rawString->hasUnicodeMarker()) { + isUnicode = gTrue; + unicodeLength = (obj.getString()->getLength() - 2) / 2; + } else { + isUnicode = gFalse; + unicodeLength = obj.getString()->getLength(); + } + unicodeString = new Unicode[unicodeLength]; + + for (int i=0; i<unicodeLength; i++) { + if (isUnicode) { + unicodeString[i] = ((rawString->getChar((i+1)*2) & 0xff) << 8) | + (rawString->getChar(((i+1)*2)+1) & 0xff); + } else { + unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff]; + } + } + + // HTML escape and encode unicode + encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength); + delete[] unicodeString; } + 
obj.free(); - return s1; + return encodedString; } static GooString* getInfoDate(Dict *infoDict, char *key) { _______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
