Adam Reichold wrote: >> A) use the functions in UTF.h, and change ustring class >> to fit them.
>> B) rewrite the functions in UTF.h by template programming >> and have the interfaces fitting to current ustring(). >> C) write diversions of the functions in UTF.h in cpp frontend. > > I think what is best here is hard to decide without seeing actual code > that at least starts in one of these directions. Indeed. In fact, yesterday I was trying to write a sample patch for B) direction, but I got stuck. Because current UTF.h is written in C-style (*p++ = c style copy/concatenation). Thus, if I try to pass std::basic_string or std::vector object, I have to make yet another template (and different implementations) for the concatenation. Following is current my proposal, which still use reinterpret_cast<>(), but if it is not good (e.g. unsigned short != uint16_t), it allocates the buffer and free it internally. By replacing iconv() by UTF.h functions, I keep from the cast between unsigned short (with unknown byte order) array and char array. Please let me explain... > --- a/cpp/poppler-global.cpp > +++ b/cpp/poppler-global.cpp > @@ -26,6 +26,7 @@ > #include "poppler-private.h" > > #include "DateInfo.h" > +#include "UTF.h" > > #include <algorithm> Include "UTF.h" to use UTF-8 <-> UTF-16 helpers in there. > @@ -34,31 +35,8 @@ > #include <ios> > #include <iostream> > > -#include <iconv.h> > - > #include "config.h" > > -namespace > -{ > - > -struct MiniIconv > -{ > - MiniIconv(const char *to_code, const char *from_code) > - : i_(iconv_open(to_code, from_code)) > - {} > - ~MiniIconv() > - { if (is_valid()) iconv_close(i_); } > - MiniIconv(const MiniIconv &) = delete; > - MiniIconv& operator=(const MiniIconv &) = delete; > - bool is_valid() const > - { return i_ != (iconv_t)-1; } > - operator iconv_t() const > - { return i_; } > - iconv_t i_; > -}; > - > -} > - Remove iconv() related functions. > @@ -226,28 +204,31 @@ byte_array ustring::to_utf8() const > return byte_array(); > } > > - MiniIconv ic("UTF-8", "UTF-16"); > - if (!ic.is_valid()) { > - return byte_array(); Almost replacement of ustring::to_utf8(). See below. > +#if 0xFFFFU == USHRT_MAX > + const uint16_t* utf16_buff = reinterpret_cast<const uint16_t *>(data()); > +#else above is the popular case that uint16_t == unsigned int. simple casting is enough to get uint16_t array from ustring object. > + const uint16_t* utf16_begin = new uint16_t[utf16_len]; > + const uint16_t* utf16_buff = utf16_begin; > + const unsigned short *me = data(); > + > + for (int i = 0; i < size(); i++) { > + utf16_buff[i] = (uint16_t)me[i]; > } [snip the removal of existing code] > +#endif above is the irregular case that uint16_t != unsigned int. allocate temporal array of uint16_t and copy the contents. for skipping of BOM in the beginning, the heaf of allocated buffer is recorded in utf16_begin, but the UTF-16 data to be converted starts from utf16_buff. > + const uint16_t* utf16_end = utf16_buff + size(); > + if (utf16_buff[0] == 0xFEFF) { > + utf16_buff ++; > } in abovem utf16_buff is shifted if the beginning of the buffer is BOM. > - str.resize(str.size() - str_len_left); > - return str; > + > + const int utf8_len = utf16CountUtf8Bytes(utf16_buff); > + byte_array ret(utf8_len + 1, '\0'); count the required buffer size by utf16CountUtf8Bytes in UTF.h, and allocate the byte_array to be returned. To assue the null termination, the size is incremented and filled by '\0'. > + utf16ToUtf8(utf16_buff, reinterpret_cast<char *>(ret.data()), utf8_len + > 1, utf16_end - utf16_buff); > + ret.resize(std::strlen(ret.data())); convert by utf16ToUtf8(), and shrink to fit converted result. > + > +#if 0xFFFFU != USHRT_MAX > + delete utf16_begin; > +#endif > + return ret; > } if irregular case, the temporal buffer should be freed. > @@ -256,15 +237,29 @@ std::string ustring::to_latin1() const > return std::string(); > } > > - const size_type mylength = size(); > - std::string ret(mylength, '\0'); > const value_type *me = data(); > - for (size_type i = 0; i < mylength; ++i) { > + const value_type *me_end = me + size(); > + if (*me == 0xFEFF) { > + me ++; > + } > + std::string ret(me_end - me, '\0'); > + for (size_type i = 0; me < me_end; ++i) { > ret[i] = (char)*me++; > } > return ret; > } original implementation of ustring::to_latin() does not care for BOM. now BOM is checked and skipped if there is. > +static bool has_bom_utf8(const char *c, int len) > +{ > + if ( 3 > len ) { > + return false; > + } > + if (c[0] == 0xEF && c[1] == 0xBB && c[2] == 0xBF) { > + return true; > + } > + return false; > +} > + > ustring ustring::from_utf8(const char *str, int len) > { > if (len <= 0) { even if ustring::from_utf8 receives UTF-8 string with BOM, created ustring object should not include BOM. has_bom_utf8() is an utility function for it. > @@ -273,31 +268,27 @@ ustring ustring::from_utf8(const char *str, int len) > return ustring(); > } > } > - > - MiniIconv ic("UTF-16", "UTF-8"); > - if (!ic.is_valid()) { > - return ustring(); remove iconv() related function. > + int utf16_count = utf8CountUtf16CodeUnits(str); > + if (has_bom_utf8(str, len)) { > + str += 3; // skip BOM > + len -= 3; // skip BOM > + utf16_count --; // skip BOM > } if UTF-8 starts with BOM, it is skiped, the length of UTF-8 buffer is decremented (3 octets), the length of UTF-16 buffer is decremented (16bit). > + ustring ret(utf16_count + 1, 0); allocate ustring object to be returned. to assure the U+0000l termination, the size is incremented. > +#if 0xFFFFU == USHRT_MAX > + const uint16_t* utf16_buff = reinterpret_cast<const uint16_t > *>(ret.data()); this is for popular case where uint16_t == unsigned short. simple cast is enough to make the array of uint16_t. > +#else > + const uint16_t* utf16_buff = new uint16_t[utf16_count + 1](0); > +#endif this is for irregular case. allocate temporal buffer to store uint16_t data. > + utf8ToUtf16(str, (uint16_t*)utf16_buff, utf16_count + 1, len); fill utf16_buff by converted UTF-16 data. > +#if 0xFFFFU != USHRT_MAX > + for (int i = 0; i < utf16_count; i ++) { > + ret[i] = (unsigned short)utf16_buff[i]; > } in irregular case, the content should be copied to the object to be returned. > + delete utf16_buff; > +#endif the buffer for irregular case is freed here. > return ret; > } > > --- a/cpp/poppler-page.cpp > +++ b/cpp/poppler-page.cpp > @@ -355,7 +365,7 @@ std::vector<text_box> page::text_list() const > TextWord *word = word_list->get(i); > > std::unique_ptr<GooString> gooWord{word->getText()}; > - ustring ustr = > detail::unicode_GooString_to_ustring(gooWord.get()); > + ustring ustr = ustring::from_utf8(gooWord->getCString()); here, gooWord is the object created by TextOutputDev, so it is always UTF-8. it should be passed to ustring::from_utf8(). > --- a/cpp/poppler-private.cpp > +++ b/cpp/poppler-private.cpp > @@ -23,6 +23,7 @@ > #include "poppler-private.h" > > #include "GooString.h" > +#include "PDFDocEncoding.h" > #include "Page.h" > > #include <ctime> the strings for PDF metadata is UTF-16BE with BOM or PDFDocEncoding. to use PDFDocEncoding helper, include PDFDocEncoding.h > @@ -58,16 +59,16 @@ rectf detail::pdfrectangle_to_rectf(const PDFRectangle > &pdfrect) > > ustring detail::unicode_GooString_to_ustring(const GooString *str) > { > - const char *data = str->getCString(); > const int len = str->getLength(); > + const char *data = str->getCString(); > + const char *data_end = data + len; input data may or may not have BOM, so the length of UTF-8 string to be converted may or may not be len. to clarify the end of buffer, data_end is calculated, instead of the index "i" in the existing implementation. > - int i = 0; > bool is_unicode = false; > if ((data[0] & 0xff) == 0xfe && (len > 1 && (data[1] & 0xff) == 0xff)) { > is_unicode = true; > - i = 2; > + data += 2; // skip BOM > } instead of the index i, the beginning of UTF-16BE buffer is shifted to skip BOM. > - ustring::size_type ret_len = len - i; > + ustring::size_type ret_len = data_end - data; > if (is_unicode) { > ret_len >>= 1; > } ditto. > @@ -75,15 +76,15 @@ ustring detail::unicode_GooString_to_ustring(const > GooString *str) > size_t ret_index = 0; > ustring::value_type u; > if (is_unicode) { > - while (i < len) { > - u = ((data[i] & 0xff) << 8) | (data[i + 1] & 0xff); > - i += 2; > + while (data < data_end) { > + u = ((*data & 0xff) << 8) | (*(data + 1) & 0xff); > + data += 2; > ret[ret_index++] = u; > } > } else { > - while (i < len) { > - u = data[i] & 0xff; > - ++i; > + while (data < data_end) { > + u = pdfDocEncoding[ *data & 0xff ]; > + data ++; > ret[ret_index++] = u; > } > } use *data instead of data[i].
mps-fix-cpp-encoding-issue_20180417-0010.diff.xz
Description: Binary data
_______________________________________________ poppler mailing list poppler@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/poppler