CMakeLists.txt | 3 - goo/GooString.cc | 3 - poppler/CairoOutputDev.cc | 2 poppler/CharCodeToUnicode.cc | 13 +++-- poppler/GlobalParams.cc | 2 poppler/Makefile.am | 3 - poppler/TextOutputDev.cc | 55 ++------------------- poppler/UTF.cc | 104 ++++++++++++++++++++++++++++++++++++++++ poppler/UTF.h | 111 +++++++++++++++++++++++++++++++++++++++++++ poppler/UTF8.h | 84 -------------------------------- utils/HtmlOutputDev.cc | 16 ------ utils/pdfinfo.cc | 37 ++------------ 12 files changed, 246 insertions(+), 187 deletions(-)
New commits: commit cd1ab1e34032d5620140bd0b6b6ec4b74f89ae19 Author: Albert Astals Cid <[email protected]> Date: Thu Aug 30 22:36:14 2012 +0200 Update Adrian's copyrights diff --git a/goo/GooString.cc b/goo/GooString.cc index 61dee33..451a70e 100644 --- a/goo/GooString.cc +++ b/goo/GooString.cc @@ -21,6 +21,7 @@ // Copyright (C) 2008-2011 Albert Astals Cid <[email protected]> // Copyright (C) 2011 Kenji Uno <[email protected]> // Copyright (C) 2012 Fabio D'Urso <[email protected]> +// Copyright (C) 2012 Adrian Johnson <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc index ce16ee5..4298090 100644 --- a/poppler/CharCodeToUnicode.cc +++ b/poppler/CharCodeToUnicode.cc @@ -21,6 +21,7 @@ // Copyright (C) 2010 William Bader <[email protected]> // Copyright (C) 2010 Jakub Wilk <[email protected]> // Copyright (C) 2012 Thomas Freitag <[email protected]> +// Copyright (C) 2012 Adrian Johnson <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index cc18c9b..adbb79f 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -18,7 +18,7 @@ // Copyright (C) 2006-2008, 2011 Carlos Garcia Campos <[email protected]> // Copyright (C) 2006, 2007 Ed Catmur <[email protected]> // Copyright (C) 2006 Jeff Muizelaar <[email protected]> -// Copyright (C) 2007, 2008 Adrian Johnson <[email protected]> +// Copyright (C) 2007, 2008, 2012 Adrian Johnson <[email protected]> // Copyright (C) 2008 Koji Otani <[email protected]> // Copyright (C) 2008, 2010, 2011 Albert Astals Cid <[email protected]> // Copyright (C) 2008 Pino Toscano <[email protected]> diff --git a/poppler/UTF.cc b/poppler/UTF.cc index 0642d04..8e9cb9d 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -1,3 +1,26 @@ +//======================================================================== +// +// UTF.h +// +// Copyright 2001-2003 Glyph & Cog, LLC +// +//======================================================================== + +//======================================================================== +// +// Modified under the Poppler project - http://poppler.freedesktop.org +// +// All changes made under the Poppler project to this file are licensed +// under GPL version 2 or later +// +// Copyright (C) 2008 Koji Otani <[email protected]> +// Copyright (C) 2012 Adrian Johnson <[email protected]> +// +// To see a description of the changes please see the Changelog file that +// came with your tarball or type make ChangeLog if you are building from git +// +//======================================================================== + #include "goo/gmem.h" #include "PDFDocEncoding.h" #include "UTF.h" diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index b3bb17d..e4bd0b1 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -25,7 +25,7 @@ // Copyright (C) 2009 Warren Toomey <[email protected]> // Copyright (C) 2009, 2011 Carlos Garcia Campos <[email protected]> // Copyright (C) 2009 Reece Dunn <[email protected]> -// Copyright (C) 2010 Adrian Johnson <[email protected]> +// Copyright (C) 2010, 2012 Adrian Johnson <[email protected]> // Copyright (C) 2010 Hib Eris <[email protected]> // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey ([email protected]) and Onkar Potdar ([email protected]) // Copyright (C) 2011 Joshua Richardson <[email protected]> commit ce8a579f339507da3fd7802e1531fbf6849c0c98 Author: Adrian Johnson <[email protected]> Date: Tue Aug 28 22:16:34 2012 +0930 Move text to unicode conversion into a separate function This also ensures UTF-16 ActualText strings are converted to UCS-4 before calling addChar. diff --git a/goo/GooString.cc b/goo/GooString.cc index 1ebf341..61dee33 100644 --- a/goo/GooString.cc +++ b/goo/GooString.cc @@ -895,7 +895,7 @@ int GooString::cmpN(const char *sA, int n) const { GBool GooString::hasUnicodeMarker(void) { - return (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff; + return length > 1 && (s[0] & 0xff) == 0xfe && (s[1] & 0xff) == 0xff; } GooString *GooString::sanitizedName(GBool psmode) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 7db041e..cc18c9b 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -63,7 +63,7 @@ #include "TextOutputDev.h" #include "Page.h" #include "Annot.h" -#include "PDFDocEncoding.h" +#include "UTF.h" #ifdef MACOS // needed for setting type/creator of MacOS files @@ -5230,41 +5230,17 @@ void ActualText::end(GfxState *state) { // extents of all the glyphs inside the span if (actualTextNBytes) { - char *uniString = NULL; Unicode *uni; - int length, i; - - if (!actualText->hasUnicodeMarker()) { - if (actualText->getLength() > 0) { - //non-unicode string -- assume pdfDocEncoding and - //try to convert to UTF16BE - uniString = pdfDocEncodingToUTF16(actualText, &length); - } else { - length = 0; - } - } else { - uniString = actualText->getCString(); - length = actualText->getLength(); - } - - if (length < 3) - length = 0; - else - length = length/2 - 1; - uni = new Unicode[length]; - for (i = 0 ; i < length; i++) - uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff); + int length; // now that we have the position info for all of the text inside // the marked content span, we feed the "ActualText" back through // text->addChar() + length = TextStringToUCS4(actualText, &uni); text->addChar(state, actualTextX0, actualTextY0, actualTextX1 - actualTextX0, actualTextY1 - actualTextY0, 0, actualTextNBytes, uni, length); - - delete [] uni; - if (!actualText->hasUnicodeMarker()) - delete [] uniString; + gfree(uni); } delete actualText; diff --git a/poppler/UTF.cc b/poppler/UTF.cc index b5f7d9f..0642d04 100644 --- a/poppler/UTF.cc +++ b/poppler/UTF.cc @@ -1,4 +1,5 @@ #include "goo/gmem.h" +#include "PDFDocEncoding.h" #include "UTF.h" int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4) @@ -45,3 +46,36 @@ int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4) return len; } +int TextStringToUCS4(GooString *textStr, Unicode **ucs4) +{ + int i, len; + const char *s; + Unicode *u; + + len = textStr->getLength(); + s = textStr->getCString(); + if (len == 0) + return 0; + + if (textStr->hasUnicodeMarker()) { + Unicode *utf16; + len = len/2 - 1; + if (len > 0) { + utf16 = new Unicode[len]; + for (i = 0 ; i < len; i++) { + utf16[i] = (s[2 + i*2] & 0xff) << 8 | (s[3 + i*2] & 0xff); + } + len = UTF16toUCS4(utf16, len, &u); + delete utf16; + } else { + u = NULL; + } + } else { + u = (Unicode*)gmallocn(len, sizeof(Unicode)); + for (i = 0 ; i < len; i++) { + u[i] = pdfDocEncoding[s[i]]; + } + } + *ucs4 = u; + return len; +} diff --git a/poppler/UTF.h b/poppler/UTF.h index d0ef5bc..ec51e5a 100644 --- a/poppler/UTF.h +++ b/poppler/UTF.h @@ -27,6 +27,7 @@ #pragma implementation #endif +#include "goo/GooString.h" #include "CharTypes.h" // Convert a UTF-16 string to a UCS-4 @@ -36,6 +37,13 @@ // returns number of UCS-4 characters int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out); +// Convert a PDF Text String to UCS-4 +// s - PDF text string +// ucs4 - if the number of UCS-4 characters is > 0, allocates and +// returns UCS-4 string. Free with gfree. +// returns number of UCS-4 characters +int TextStringToUCS4(GooString *textStr, Unicode **ucs4); + static int mapUTF8(Unicode u, char *buf, int bufSize) { if (u <= 0x0000007f) { diff --git a/utils/pdfinfo.cc b/utils/pdfinfo.cc index cdc5375..d1c077b 100644 --- a/utils/pdfinfo.cc +++ b/utils/pdfinfo.cc @@ -48,7 +48,7 @@ #include "PDFDocFactory.h" #include "CharTypes.h" #include "UnicodeMap.h" -#include "PDFDocEncoding.h" +#include "UTF.h" #include "Error.h" #include "DateInfo.h" @@ -379,41 +379,16 @@ static void printInfoString(Dict *infoDict, const char *key, const char *text, UnicodeMap *uMap) { Object obj; GooString *s1; - GBool isUnicode; - Unicode u, u2; + Unicode *u; char buf[8]; - int i, n; + int i, n, len; if (infoDict->lookup(key, &obj)->isString()) { fputs(text, stdout); s1 = obj.getString(); - if ((s1->getChar(0) & 0xff) == 0xfe && - (s1->getChar(1) & 0xff) == 0xff) { - isUnicode = gTrue; - i = 2; - } else { - isUnicode = gFalse; - i = 0; - } - while (i < obj.getString()->getLength()) { - if (isUnicode) { - u = ((s1->getChar(i) & 0xff) << 8) | - (s1->getChar(i+1) & 0xff); - i += 2; - if (u >= 0xd800 && u <= 0xdbff && i < obj.getString()->getLength()) { - // surrogate pair - u2 = ((s1->getChar(i) & 0xff) << 8) | - (s1->getChar(i+1) & 0xff); - i += 2; - if (u2 >= 0xdc00 && u2 <= 0xdfff) { - u = 0x10000 + ((u - 0xd800) << 10) + (u2 - 0xdc00); - } - } - } else { - u = pdfDocEncoding[s1->getChar(i) & 0xff]; - ++i; - } - n = uMap->mapUnicode(u, buf, sizeof(buf)); + len = TextStringToUCS4(s1, &u); + for (i = 0; i < len; i++) { + n = uMap->mapUnicode(u[i], buf, sizeof(buf)); fwrite(buf, 1, n, stdout); } fputc('\n', stdout); commit cac13e782cf4413703cfd1fa23e76133dfbe5ef9 Author: Adrian Johnson <[email protected]> Date: Tue Aug 28 21:48:16 2012 +0930 text: increase the tolerance for overlapping glyphs TextOutputDev will start a new line when encountering consecutive glyphs with overlapping bounding boxes. This can occur when drawing diacritics with a separate glyph. In this case, due to the diacritic having a different baseline, the lines may be output in the wrong order. This patch increases the tolerance for overlapping bounding boxes to prevent diacritics from splitting lines. diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 3020e22..7db041e 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -127,7 +127,7 @@ // Minimum spacing between characters within a word, as a fraction of // the font size. -#define minCharSpacing -0.2 +#define minCharSpacing -0.5 // Maximum spacing between characters within a word, as a fraction of // the font size, when there is no obvious extra-wide character commit 6f6386219449e70c2c3bc3559fdde3df4a57a809 Author: Adrian Johnson <[email protected]> Date: Thu Mar 8 20:52:28 2012 +1030 Convert UTF-16 to UCS-4 when reading toUnicode cmap to ensure only UCS-4 values are used with the "Unicode" type. diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b07470..6bddf0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -296,6 +296,7 @@ set(poppler_SRCS poppler/strtok_r.cpp poppler/UnicodeMap.cc poppler/UnicodeTypeTable.cc + poppler/UTF.cc poppler/XRef.cc poppler/PSOutputDev.cc poppler/TextOutputDev.cc @@ -466,7 +467,7 @@ if(ENABLE_XPDF_HEADERS) poppler/SecurityHandler.h poppler/StdinCachedFile.h poppler/StdinPDFDocBuilder.h - poppler/UTF8.h + poppler/UTF.h poppler/XpdfPluginAPI.h poppler/Sound.h ${CMAKE_CURRENT_BINARY_DIR}/poppler/poppler-config.h diff --git a/poppler/CairoOutputDev.cc b/poppler/CairoOutputDev.cc index b70183e..d8f78d7 100644 --- a/poppler/CairoOutputDev.cc +++ b/poppler/CairoOutputDev.cc @@ -61,7 +61,7 @@ #include "CairoOutputDev.h" #include "CairoFontEngine.h" #include "CairoRescaleBox.h" -#include "UTF8.h" +#include "UTF.h" //------------------------------------------------------------------------ // #define LOG_CAIRO diff --git a/poppler/CharCodeToUnicode.cc b/poppler/CharCodeToUnicode.cc index d0e6c7f..ce16ee5 100644 --- a/poppler/CharCodeToUnicode.cc +++ b/poppler/CharCodeToUnicode.cc @@ -43,6 +43,7 @@ #include "GlobalParams.h" #include "PSTokenizer.h" #include "CharCodeToUnicode.h" +#include "UTF.h" //------------------------------------------------------------------------ @@ -453,15 +454,16 @@ void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n, } map[code] = 0; sMap[sMapLen].c = code; - sMap[sMapLen].len = n / 4; - sMap[sMapLen].u = (Unicode*)gmallocn(sMap[sMapLen].len, sizeof(Unicode)); - for (j = 0; j < sMap[sMapLen].len; ++j) { - if (!parseHex(uStr + j*4, 4, &sMap[sMapLen].u[j])) { + int utf16Len = n / 4; + Unicode *utf16 = (Unicode*)gmallocn(utf16Len, sizeof(Unicode)); + for (j = 0; j < utf16Len; ++j) { + if (!parseHex(uStr + j*4, 4, &utf16[j])) { error(errSyntaxWarning, -1, "Illegal entry in ToUnicode CMap"); return; } } - sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset; + utf16[utf16Len - 1] += offset; + sMap[sMapLen].len = UTF16toUCS4(utf16, utf16Len, &sMap[sMapLen].u); ++sMapLen; } } diff --git a/poppler/GlobalParams.cc b/poppler/GlobalParams.cc index 098e4a4..148a0dd 100644 --- a/poppler/GlobalParams.cc +++ b/poppler/GlobalParams.cc @@ -108,7 +108,7 @@ #include "NameToUnicodeTable.h" #include "UnicodeMapTables.h" -#include "UTF8.h" +#include "UTF.h" #ifdef ENABLE_PLUGINS # ifdef _WIN32 diff --git a/poppler/Makefile.am b/poppler/Makefile.am index 8920f8e..e9ac9d4 100644 --- a/poppler/Makefile.am +++ b/poppler/Makefile.am @@ -251,7 +251,7 @@ poppler_include_HEADERS = \ PSOutputDev.h \ TextOutputDev.h \ SecurityHandler.h \ - UTF8.h \ + UTF.h \ XpdfPluginAPI.h \ Sound.h nodist_poppler_include_HEADERS = poppler-config.h @@ -317,6 +317,7 @@ libpoppler_la_SOURCES = \ strtok_r.cpp \ UnicodeMap.cc \ UnicodeTypeTable.cc \ + UTF.cc \ ViewerPreferences.cc \ XRef.cc \ PSOutputDev.cc \ diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 9af7532..3020e22 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -2392,24 +2392,7 @@ void TextPage::addChar(GfxState *state, double x, double y, w1 /= uLen; h1 /= uLen; for (i = 0; i < uLen; ++i) { - if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */ - if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) { - /* next code is a low surrogate */ - Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000; - i++; - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu); - } else { - /* missing low surrogate - replace it with REPLACEMENT CHARACTER (U+FFFD) */ - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd); - } - } else if (u[i] >= 0xdc00 && u[i] < 0xe000) { - /* invalid low surrogate - replace it with REPLACEMENT CHARACTER (U+FFFD) */ - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd); - } else { - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]); - } + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]); } } charPos += nBytes; diff --git a/poppler/UTF.cc b/poppler/UTF.cc new file mode 100644 index 0000000..b5f7d9f --- /dev/null +++ b/poppler/UTF.cc @@ -0,0 +1,47 @@ +#include "goo/gmem.h" +#include "UTF.h" + +int UTF16toUCS4(const Unicode *utf16, int utf16Len, Unicode **ucs4) +{ + int i, n, len; + Unicode *u; + + // count characters + len = 0; + for (i = 0; i < utf16Len; i++) { + if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && + utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) { + i++; /* surrogate pair */ + } + len++; + } + if (ucs4 == NULL) + return len; + + u = (Unicode*)gmallocn(len, sizeof(Unicode)); + n = 0; + // convert string + for (i = 0; i < utf16Len; i++) { + if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */ + if (i + 1 < utf16Len && utf16[i+1] >= 0xdc00 && utf16[i+1] < 0xe000) { + /* next code is a low surrogate */ + u[n] = (((utf16[i] & 0x3ff) << 10) | (utf16[i+1] & 0x3ff)) + 0x10000; + ++i; + } else { + /* missing low surrogate + replace it with REPLACEMENT CHARACTER (U+FFFD) */ + u[n] = 0xfffd; + } + } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) { + /* invalid low surrogate + replace it with REPLACEMENT CHARACTER (U+FFFD) */ + u[n] = 0xfffd; + } else { + u[n] = utf16[i]; + } + n++; + } + *ucs4 = u; + return len; +} + diff --git a/poppler/UTF.h b/poppler/UTF.h new file mode 100644 index 0000000..d0ef5bc --- /dev/null +++ b/poppler/UTF.h @@ -0,0 +1,103 @@ +//======================================================================== +// +// UTF.h +// +// Copyright 2001-2003 Glyph & Cog, LLC +// +//======================================================================== + +//======================================================================== +// +// Modified under the Poppler project - http://poppler.freedesktop.org +// +// All changes made under the Poppler project to this file are licensed +// under GPL version 2 or later +// +// Copyright (C) 2008 Koji Otani <[email protected]> +// +// To see a description of the changes please see the Changelog file that +// came with your tarball or type make ChangeLog if you are building from git +// +//======================================================================== + +#ifndef UTF_H +#define UTF_H + +#ifdef USE_GCC_PRAGMAS +#pragma implementation +#endif + +#include "CharTypes.h" + +// Convert a UTF-16 string to a UCS-4 +// utf16 - utf16 bytes +// utf16_len - number of UTF-16 characters +// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree. +// returns number of UCS-4 characters +int UTF16toUCS4(const Unicode *utf16, int utf16_len, Unicode **ucs4_out); + + +static int mapUTF8(Unicode u, char *buf, int bufSize) { + if (u <= 0x0000007f) { + if (bufSize < 1) { + return 0; + } + buf[0] = (char)u; + return 1; + } else if (u <= 0x000007ff) { + if (bufSize < 2) { + return 0; + } + buf[0] = (char)(0xc0 + (u >> 6)); + buf[1] = (char)(0x80 + (u & 0x3f)); + return 2; + } else if (u <= 0x0000ffff) { + if (bufSize < 3) { + return 0; + } + buf[0] = (char)(0xe0 + (u >> 12)); + buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[2] = (char)(0x80 + (u & 0x3f)); + return 3; + } else if (u <= 0x0010ffff) { + if (bufSize < 4) { + return 0; + } + buf[0] = (char)(0xf0 + (u >> 18)); + buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); + buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[3] = (char)(0x80 + (u & 0x3f)); + return 4; + } else { + return 0; + } +} + +static int mapUCS2(Unicode u, char *buf, int bufSize) { + if (u <= 0xffff) { + if (bufSize < 2) { + return 0; + } + buf[0] = (char)((u >> 8) & 0xff); + buf[1] = (char)(u & 0xff); + return 2; + } else if (u < 0x110000) { + Unicode uu; + + /* using surrogate pair */ + if (bufSize < 4) { + return 0; + } + uu = ((u - 0x10000) >> 10) + 0xd800; + buf[0] = (char)((uu >> 8) & 0xff); + buf[1] = (char)(uu & 0xff); + uu = (u & 0x3ff)+0xdc00; + buf[2] = (char)((uu >> 8) & 0xff); + buf[3] = (char)(uu & 0xff); + return 4; + } else { + return 0; + } +} + +#endif diff --git a/poppler/UTF8.h b/poppler/UTF8.h deleted file mode 100644 index 34a07d4..0000000 --- a/poppler/UTF8.h +++ /dev/null @@ -1,84 +0,0 @@ -//======================================================================== -// -// UTF8.h -// -// Copyright 2001-2003 Glyph & Cog, LLC -// -//======================================================================== - -//======================================================================== -// -// Modified under the Poppler project - http://poppler.freedesktop.org -// -// All changes made under the Poppler project to this file are licensed -// under GPL version 2 or later -// -// Copyright (C) 2008 Koji Otani <[email protected]> -// -// To see a description of the changes please see the Changelog file that -// came with your tarball or type make ChangeLog if you are building from git -// -//======================================================================== - -static int mapUTF8(Unicode u, char *buf, int bufSize) { - if (u <= 0x0000007f) { - if (bufSize < 1) { - return 0; - } - buf[0] = (char)u; - return 1; - } else if (u <= 0x000007ff) { - if (bufSize < 2) { - return 0; - } - buf[0] = (char)(0xc0 + (u >> 6)); - buf[1] = (char)(0x80 + (u & 0x3f)); - return 2; - } else if (u <= 0x0000ffff) { - if (bufSize < 3) { - return 0; - } - buf[0] = (char)(0xe0 + (u >> 12)); - buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[2] = (char)(0x80 + (u & 0x3f)); - return 3; - } else if (u <= 0x0010ffff) { - if (bufSize < 4) { - return 0; - } - buf[0] = (char)(0xf0 + (u >> 18)); - buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); - buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); - buf[3] = (char)(0x80 + (u & 0x3f)); - return 4; - } else { - return 0; - } -} - -static int mapUCS2(Unicode u, char *buf, int bufSize) { - if (u <= 0xffff) { - if (bufSize < 2) { - return 0; - } - buf[0] = (char)((u >> 8) & 0xff); - buf[1] = (char)(u & 0xff); - return 2; - } else if (u < 0x110000) { - Unicode uu; - - /* using surrogate pair */ - if (bufSize < 4) { - return 0; - } - uu = ((u - 0x10000) >> 10) + 0xd800; - buf[0] = (char)((uu >> 8) & 0xff); - buf[1] = (char)(uu & 0xff); - uu = (u & 0x3ff)+0xdc00; - buf[2] = (char)((uu >> 8) & 0xff); - buf[3] = (char)(uu & 0xff); - return 4; - } else { - return 0; - } -} diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc index 83f65d5..b3bb17d 100644 --- a/utils/HtmlOutputDev.cc +++ b/utils/HtmlOutputDev.cc @@ -400,19 +400,7 @@ void HtmlPage::addChar(GfxState *state, double x, double y, h1 /= uLen; } for (i = 0; i < uLen; ++i) { - Unicode u1 = u[i]; - if (u1 >= 0xd800 && u1 <= 0xdbff && i < uLen) { - // surrogate pair - const Unicode u2 = u[i + 1]; - if (u2 >= 0xdc00 && u2 <= 0xdfff) { - u1 = 0x10000 + ((u1 - 0xd800) << 10) + (u2 - 0xdc00); - - curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1); - } - ++i; - } else { - curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u1); - } + curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); } } _______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
