poppler/UTF.cc | 17 ++++++++++++++++- qt5/tests/check_utf_conversion.cpp | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-)
New commits:
commit 9a880ecd7d865a12b0f91f56285907bbb409f32f
Author: Nelson Benítez León <[email protected]>
Date:   Thu Jul 9 01:36:24 2020 -0400

    Add test for UTF16LE string support

    Issue #941

diff --git a/qt5/tests/check_utf_conversion.cpp b/qt5/tests/check_utf_conversion.cpp
index f28829f4..1f04c2a5 100644
--- a/qt5/tests/check_utf_conversion.cpp
+++ b/qt5/tests/check_utf_conversion.cpp
@@ -4,6 +4,7 @@
 #include <poppler-private.h>
 
 #include <cstring>
+#include <cstdint> // for uint8_t
 
 #include "GlobalParams.h"
 #include "UnicodeTypeTable.h"
@@ -18,6 +19,7 @@ private slots:
     void testUTF_data();
     void testUTF();
     void testUnicodeToAscii7();
+    void testUnicodeLittleEndian();
 };
 
 static bool compare(const char *a, const char *b)
@@ -143,5 +145,35 @@ void TestUTFConversion::testUnicodeToAscii7()
     free(out_ascii_idx);
 }
 
+void TestUTFConversion::testUnicodeLittleEndian()
+{
+    const uint8_t UTF16LE_hi[] { 0xFF, 0xFE, 0x48, 0x00, 0x49, 0x00, 0x21, 0x00, 0x11, 0x26 }; // UTF16-LE bytes: BOM, "HI!", U+2611
+    GooString GooUTF16LE(reinterpret_cast<const char *>(UTF16LE_hi), 5 * 2);
+
+    const uint8_t UTF16BE_hi[] { 0xFE, 0xFF, 0x00, 0x48, 0x00, 0x49, 0x00, 0x21, 0x26, 0x11 }; // UTF16-BE bytes: BOM, "HI!", U+2611
+    GooString GooUTF16BE(reinterpret_cast<const char *>(UTF16BE_hi), 5 * 2);
+
+    // Let's verify both GooString's are different (QVERIFY, unlike Q_ASSERT, is also active in release builds)
+    QVERIFY(GooUTF16LE.cmp(&GooUTF16BE) != 0);
+
+    Unicode *UCS4fromLE, *UCS4fromBE;
+    const int len1 = TextStringToUCS4(&GooUTF16LE, &UCS4fromLE);
+    const int len2 = TextStringToUCS4(&GooUTF16BE, &UCS4fromBE);
+
+    // 4 as TextStringToUCS4() removes the leading Byte Order Mark (BOM) from each string
+    QVERIFY(len1 == len2);
+    QVERIFY(len1 == 4);
+
+    // Check that after conversion, UCS4fromLE and UCS4fromBE are the same
+    for (int i = 0; i < len1; i++) {
+        QVERIFY(UCS4fromLE[i] == UCS4fromBE[i]);
+    }
+
+    // Final verifications: U+2611 has a non-zero high byte (catches byte-swap bugs), and the prefix must be "HI!"
+    QVERIFY(UCS4fromLE[3] == 0x2611);
+    QVERIFY(compare(UCS4fromLE, "HI!", 3));
+    QVERIFY(compare(UCS4fromBE, "HI!", 3));
+}
+
 QTEST_GUILESS_MAIN(TestUTFConversion)
 #include "check_utf_conversion.moc"

commit 232cba307e8be35022426ba85f34198af7406899
Author: Nelson Benítez León <[email protected]>
Date:   Thu Jul 9 01:37:20 2020 -0400

    Make TextStringToUCS4() support UTF16-LE too

    UTF16-LE strings can 'de facto' appear on pdf's
    (eg. title of Outline items) and Acrobat display
    them fine, so let's support that so we don't show
    an ugly 'ÿþ' at start of the text (Okular) or
    even no text at all (Evince).

    Issue #941
    Evince issue: https://gitlab.gnome.org/GNOME/evince/-/issues/1444

diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 112986af..d231bde1 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -90,6 +90,7 @@ int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
     int i, len;
     const char *s;
     Unicode *u;
+    bool isUnicode, isUnicodeLE;
 
     len = textStr->getLength();
     s = textStr->c_str();
@@ -99,12 +100,26 @@ int TextStringToUCS4(const GooString *textStr, Unicode **ucs4)
     }
 
     if (textStr->hasUnicodeMarker()) {
+        isUnicode = true;
+        isUnicodeLE = false;
+    } else if (textStr->hasUnicodeMarkerLE()) {
+        isUnicode = false;
+        isUnicodeLE = true;
+    } else {
+        isUnicode = false;
+        isUnicodeLE = false;
+    }
+
+    if (isUnicode || isUnicodeLE) {
         Unicode *utf16;
         len = len / 2 - 1;
         if (len > 0) {
             utf16 = new Unicode[len];
             for (i = 0; i < len; i++) {
-                utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+                if (isUnicode)
+                    utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
+                else // UnicodeLE: low byte comes first, high byte second
+                    utf16[i] = (s[2 + i * 2] & 0xff) | ((s[3 + i * 2] & 0xff) << 8);
             }
             len = UTF16toUCS4(utf16, len, &u);
             delete[] utf16;
_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler
