 src/lib/MSPUBCollector.cpp | 103 +++++++++++++++++++++++---
 src/lib/MSPUBCollector.h | 11 ++
 src/lib/MSPUBParser.cpp | 2
 src/lib/MSPUBParser97.cpp | 2
 src/lib/libmspub_utils.cpp | 178 ++++++++++++++-------------------------------
 src/lib/libmspub_utils.h | 8 --
 6 files changed, 163 insertions(+), 141 deletions(-)
New commits: commit 17f68425119bb587ca8db474beb34884511b9a12 Author: Brennan T. Vincent <brenn...@email.arizona.edu> Date: Fri Jan 18 00:55:25 2013 -0700 Autodetect character set for pre-unicode MSPUB versions (still need to test for languages other than Russian, but appears to be working) diff --git a/src/lib/MSPUBCollector.cpp b/src/lib/MSPUBCollector.cpp index 635c049..5653df8 100644 --- a/src/lib/MSPUBCollector.cpp +++ b/src/lib/MSPUBCollector.cpp @@ -13,7 +13,7 @@ * License. * * Major Contributor(s): - * Copyright (C) 2012 Brennan Vincent <brenn...@email.arizona.edu> + * Copyright (C) 2012-2013 Brennan Vincent <brenn...@email.arizona.edu> * Copyright (C) 2012 Fridrich Strba <fridrich.st...@bluewin.ch> * * @@ -29,12 +29,16 @@ */ #include <math.h> + +#include <unicode/ucsdet.h> + #include "MSPUBCollector.h" #include "libmspub_utils.h" #include "MSPUBConstants.h" #include "MSPUBTypes.h" #include "PolygonUtils.h" #include "Coordinate.h" + #pragma GCC diagnostic ignored "-Wpragmas" #pragma GCC diagnostic ignored "-Wuninitialized" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" @@ -169,8 +173,10 @@ libmspub::MSPUBCollector::MSPUBCollector(libwpg::WPGPaintInterface *painter) : m_shapeInfosBySeqNum(), m_masterPages(), m_shapesWithCoordinatesRotated90(), m_masterPagesByPageSeqNum(), - m_encoding(), m_tableCellTextEndsVector(), m_stringOffsetsByTextId(), - m_calculationValuesSeen(), m_pageSeqNumsOrdered() + m_tableCellTextEndsVector(), m_stringOffsetsByTextId(), + m_calculationValuesSeen(), m_pageSeqNumsOrdered(), + m_encodingHeuristic(false), m_allText(), + m_calculatedEncoding() { } @@ -186,9 +192,9 @@ void libmspub::MSPUBCollector::setNextTableCellTextEnds( m_tableCellTextEndsVector.push_back(ends); } -void libmspub::MSPUBCollector::setEncoding(Encoding encoding) +void libmspub::MSPUBCollector::useEncodingHeuristic() { - m_encoding = encoding; + m_encodingHeuristic = true; } void libmspub::MSPUBCollector::setShapeShadow(unsigned seqNum, const Shadow &shadow) 
@@ -784,7 +790,7 @@ boost::function<void(void)> libmspub::MSPUBCollector::paintShape(const ShapeInfo { WPXString textString; appendCharacters(textString, text[i_lines].spans[i_spans].chars, - m_encoding.get_value_or(UTF_16)); + getCalculatedEncoding()); WPXPropertyList charProps = getCharStyleProps(text[i_lines].spans[i_spans].style, text[i_lines].style.m_defaultCharStyleIndex); m_painter->startTextSpan(charProps); m_painter->insertText(textString); @@ -801,6 +807,68 @@ boost::function<void(void)> libmspub::MSPUBCollector::paintShape(const ShapeInfo return &no_op; } +const char *libmspub::MSPUBCollector::getCalculatedEncoding() const +{ + if (m_calculatedEncoding.is_initialized()) + { + return m_calculatedEncoding.get(); + } + // modern versions are somewhat sane and use Unicode + if (! m_encodingHeuristic) + { + m_calculatedEncoding = "UTF-16LE"; + return m_calculatedEncoding.get(); + } + // for older versions of PUB, see if we can get ICU to tell us the encoding. + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector *ucd = NULL; + const UCharsetMatch **matches = NULL; + const UCharsetMatch *ucm = NULL; + ucd = ucsdet_open(&status); + int matchesFound = -1; + const char *name = NULL; + const char *windowsName = NULL; + if (m_allText.empty()) + { + goto csd_fail; + } + if (U_FAILURE(status)) + { + goto csd_fail; + } + // don't worry, the below call doesn't require a null-terminated string. 
+ ucsdet_setText(ucd, (const char *)(&m_allText[0]), m_allText.size(), &status); + if (U_FAILURE(status)) + { + goto csd_fail; + } + matches = ucsdet_detectAll(ucd, &matchesFound, &status); + if (U_FAILURE(status)) + { + goto csd_fail; + } + //find best fit that is an actual Windows encoding + for (int i = 0; i < matchesFound; ++i) + { + ucm = matches[i]; + name = ucsdet_getName(ucm, &status); + if (U_FAILURE(status)) + { + goto csd_fail; + } + windowsName = windowsCharsetNameByOriginalCharset(name); + if (windowsName) + { + m_calculatedEncoding = windowsName; + ucsdet_close(ucd); + return windowsName; + } + } +csd_fail: + ucsdet_close(ucd); + return "windows-1252"; // Pretty likely to give garbage text, but it's the best we can do. +} + void libmspub::MSPUBCollector::setShapeLineBackColor(unsigned shapeSeqNum, ColorReference backColor) { @@ -1142,7 +1210,7 @@ WPXPropertyList libmspub::MSPUBCollector::getCharStyleProps(const CharacterStyle { WPXString str; appendCharacters(str, m_fonts[style.fontIndex.get()], - m_encoding.get_value_or(UTF_16)); + getCalculatedEncoding()); ret.insert("style:font-name", str); } else if (defaultCharStyle.fontIndex.is_initialized() && @@ -1150,14 +1218,14 @@ WPXPropertyList libmspub::MSPUBCollector::getCharStyleProps(const CharacterStyle { WPXString str; appendCharacters(str, m_fonts[defaultCharStyle.fontIndex.get()], - m_encoding.get_value_or(UTF_16)); + getCalculatedEncoding()); ret.insert("style:font-name", str); } else if (!m_fonts.empty()) { WPXString str; appendCharacters(str, m_fonts[0], - m_encoding.get_value_or(UTF_16)); + getCalculatedEncoding()); ret.insert("style:font-name", str); } switch (style.superSubType) @@ -1325,9 +1393,26 @@ bool libmspub::MSPUBCollector::addTextString(const std::vector<TextParagraph> &s { MSPUB_DEBUG_MSG(("addTextString, id: 0x%x\n", id)); m_textStringsById[id] = str; + if (m_encodingHeuristic) + { + ponderStringEncoding(str); + } return true; //FIXME: Warn if the string already existed in the map. 
} +void libmspub::MSPUBCollector::ponderStringEncoding( + const std::vector<TextParagraph> &str) +{ + for (unsigned i = 0; i < str.size(); ++i) + { + for (unsigned j = 0; j < str[i].spans.size(); ++j) + { + const std::vector<unsigned char> &chars = str[i].spans[j].chars; + m_allText.insert(m_allText.end(), chars.begin(), chars.end()); + } + } +} + void libmspub::MSPUBCollector::setWidthInEmu(unsigned long widthInEmu) { //FIXME: Warn if this is called twice diff --git a/src/lib/MSPUBCollector.h b/src/lib/MSPUBCollector.h index be39b0d..03e7dc5 100644 --- a/src/lib/MSPUBCollector.h +++ b/src/lib/MSPUBCollector.h @@ -13,7 +13,7 @@ * License. * * Major Contributor(s): - * Copyright (C) 2012 Brennan Vincent <brenn...@email.arizona.edu> + * Copyright (C) 2012-2013 Brennan Vincent <brenn...@email.arizona.edu> * Copyright (C) 2012 Fridrich Strba <fridrich.st...@bluewin.ch> * * All Rights Reserved. @@ -135,7 +135,8 @@ public: void addDefaultParagraphStyle(const ParagraphStyle &style); void addPaletteColor(Color); bool setCurrentGroupSeqNum(unsigned seqNum); - void setEncoding(Encoding encoding); + + void useEncodingHeuristic(); void setNextTableCellTextEnds(const std::vector<unsigned> &ends); void setTextStringOffset(unsigned textId, unsigned offset); @@ -181,11 +182,13 @@ private: std::set<unsigned> m_masterPages; std::set<unsigned> m_shapesWithCoordinatesRotated90; std::map<unsigned, unsigned> m_masterPagesByPageSeqNum; - boost::optional<Encoding> m_encoding; std::vector<std::vector<unsigned> > m_tableCellTextEndsVector; std::map<unsigned, unsigned> m_stringOffsetsByTextId; mutable std::vector<bool> m_calculationValuesSeen; std::vector<unsigned> m_pageSeqNumsOrdered; + bool m_encodingHeuristic; + std::vector<unsigned char> m_allText; + mutable boost::optional<const char *> m_calculatedEncoding; // helper functions std::vector<int> getShapeAdjustValues(const ShapeInfo &info) const; boost::optional<unsigned> getMasterPageSeqNum(unsigned pageSeqNum) const; @@ -208,6 +211,8 
@@ private: WPXPropertyList getCharStyleProps(const CharacterStyle &, boost::optional<unsigned> defaultCharStyleIndex) const; WPXPropertyList getParaStyleProps(const ParagraphStyle &, boost::optional<unsigned> defaultParaStyleIndex) const; double getSpecialValue(const ShapeInfo &info, const CustomShape &shape, int arg, const std::vector<int> &adjustValues) const; + void ponderStringEncoding(const std::vector<TextParagraph> &str); + const char *getCalculatedEncoding() const; public: static WPXString getColorString(const Color &); }; diff --git a/src/lib/MSPUBParser.cpp b/src/lib/MSPUBParser.cpp index 6d1a04c..548dd48 100644 --- a/src/lib/MSPUBParser.cpp +++ b/src/lib/MSPUBParser.cpp @@ -488,7 +488,7 @@ bool libmspub::MSPUBParser::parseFontChunk( if (subSubInfo.id == EMBEDDED_FONT_NAME) { name = WPXString(); - appendCharacters(name.get(), subSubInfo.stringData, UTF_16); + appendCharacters(name.get(), subSubInfo.stringData, "UTF-16"); } else if (subSubInfo.id == EMBEDDED_EOT) { diff --git a/src/lib/MSPUBParser97.cpp b/src/lib/MSPUBParser97.cpp index e4ce736..8b8efd9 100644 --- a/src/lib/MSPUBParser97.cpp +++ b/src/lib/MSPUBParser97.cpp @@ -34,7 +34,7 @@ libmspub::MSPUBParser97::MSPUBParser97(WPXInputStream *input, MSPUBCollector *collector) : MSPUBParser2k(input, collector), m_isBanner(false) { - m_collector->setEncoding(WIN_1252); + m_collector->useEncodingHeuristic(); } unsigned short libmspub::MSPUBParser97::getTextMarker() const diff --git a/src/lib/libmspub_utils.cpp b/src/lib/libmspub_utils.cpp index ce10f5d..17c3d5c 100644 --- a/src/lib/libmspub_utils.cpp +++ b/src/lib/libmspub_utils.cpp @@ -29,9 +29,14 @@ * instead of those above. 
*/ +#include <unicode/ucnv.h> +#include <unicode/utypes.h> + #include <string.h> // for memcpy #include <math.h> #include <zlib.h> +#include <cstring> + #include "libmspub_utils.h" #ifndef M_PI @@ -40,6 +45,40 @@ #define ZLIB_CHUNK 16384 +using std::strcmp; +const char *libmspub::windowsCharsetNameByOriginalCharset(const char *name) +{ + if (strcmp(name, "Shift_JIS") == 0) + { + return "windows-932"; + } + if (strcmp(name, "GB18030") == 0) + { + return "windows-936"; + } + if (strcmp(name, "Big5") == 0) + { + return "windows-950"; + } + if (strcmp(name, "ISO-8859-1") == 0) + { + return "windows-1252"; + } + if (strcmp(name, "ISO-8859-2") == 0) + { + return "windows-1250"; + } + if (strcmp(name, "windows-1251") == 0) + { + return "windows-1251"; + } + if (strcmp(name, "windows-1256") == 0) + { + return "windows-1256"; + } + return NULL; +} + const char *libmspub::mimeByImgType(ImgType type) { switch (type) @@ -182,69 +221,6 @@ WPXBinaryData libmspub::inflateData(WPXBinaryData deflated) namespace { -static uint32_t _win1252ToUCS4(unsigned char win1252Character) -{ - switch (win1252Character) - { - case 0x80: - return 0x20AC; - case 0x82: - return 0x201A; - case 0x83: - return 0x0192; - case 0x84: - return 0x201E; - case 0x85: - return 0x2026; - case 0x86: - return 0x2020; - case 0x87: - return 0x2021; - case 0x88: - return 0x02C6; - case 0x89: - return 0x2030; - case 0x8A: - return 0x0160; - case 0x8B: - return 0x2039; - case 0x8C: - return 0x0152; - case 0x8E: - return 0x017D; - case 0x91: - return 0x2018; - case 0x92: - return 0x2019; - case 0x93: - return 0x201C; - case 0x94: - return 0x201D; - case 0x95: - return 0x2022; - case 0x96: - return 0x2013; - case 0x97: - return 0x2014; - case 0x98: - return 0x02DC; - case 0x99: - return 0x2122; - case 0x9A: - return 0x0161; - case 0x9B: - return 0x203A; - case 0x9C: - return 0x0153; - case 0x9E: - return 0x017E; - case 0x9F: - return 0x0178; - default: - return win1252Character; - } -} - static void 
_appendUCS4(WPXString &text, unsigned ucs4Character) { unsigned char first; @@ -388,71 +364,31 @@ void libmspub::readNBytes(WPXInputStream *input, unsigned long length, std::vect #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000) + void libmspub::appendCharacters(WPXString &text, const std::vector<unsigned char> characters, - Encoding encoding) + const char *encoding) { - switch (encoding) + UErrorCode status = U_ZERO_ERROR; + UConverter *conv = NULL; + conv = ucnv_open(encoding, &status); + if (U_SUCCESS(status)) { - case UTF_16: - for (std::vector<unsigned char>::const_iterator iter = characters.begin(); - iter != characters.end();) + // ICU documentation claims that character-by-character processing is faster "for small amounts of data" and "'normal' charsets" + // (in any case, it is more convenient :) ) + const char *src = (const char *)&characters[0]; + const char *srcLimit = (const char *)src + characters.size(); + while (src < srcLimit) { - uint16_t high_surrogate = 0; - bool fail = false; - uint32_t ucs4Character = 0; - while (true) + uint32_t ucs4Character = (uint32_t)ucnv_getNextUChar(conv, &src, srcLimit, &status); + if (U_SUCCESS(status)) { - if (iter == characters.end()) - { - fail = true; - break; - } - uint16_t character = *iter++; - character |= (uint16_t)(*iter++) << 8; - if (character >= 0xdc00 && character < 0xe000) /* low surrogate */ - { - if (high_surrogate) - { - ucs4Character = SURROGATE_VALUE(high_surrogate, character); - high_surrogate = 0; - break; - } - else - { - fail = true; - break; - } - } - else - { - if (high_surrogate) - { - fail = true; - break; - } - if (character >= 0xd800 && character < 0xdc00) /* high surrogate */ - high_surrogate = character; - else - { - ucs4Character = character; - break; - } - } + _appendUCS4(text, ucs4Character); } - if (fail) - throw libmspub::GenericException(); - - _appendUCS4(text, ucs4Character); } - break; - case WIN_1252: - for (std::vector<unsigned 
char>::const_iterator iter = characters.begin(); - iter != characters.end(); ++iter) - { - uint32_t ucs4 = _win1252ToUCS4(*iter); - _appendUCS4(text, ucs4); - } - break; + } + if (conv) + { + ucnv_close(conv); } } diff --git a/src/lib/libmspub_utils.h b/src/lib/libmspub_utils.h index 9b902da..c2bfb0d 100644 --- a/src/lib/libmspub_utils.h +++ b/src/lib/libmspub_utils.h @@ -92,12 +92,8 @@ typedef unsigned __int64 uint64_t; namespace libmspub { -enum Encoding -{ - UTF_16, - WIN_1252 -}; const char *mimeByImgType(ImgType type); +const char *windowsCharsetNameByOriginalCharset(const char *name); uint16_t readU16(const unsigned char *input, unsigned offset); uint32_t readU32(const unsigned char *input, unsigned offset); @@ -113,7 +109,7 @@ double readFixedPoint(WPXInputStream *input); double toFixedPoint(int fp); void readNBytes(WPXInputStream *input, unsigned long length, std::vector<unsigned char> &out); -void appendCharacters(WPXString &text, std::vector<unsigned char> characters, Encoding encoding); +void appendCharacters(WPXString &text, std::vector<unsigned char> characters, const char *encoding); bool stillReading(WPXInputStream *input, unsigned long until); _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits