poppler/TextOutputDev.cc | 195 ++++++++++++++++++++++++-------------------- poppler/UnicodeTypeTable.cc | 20 +++- poppler/UnicodeTypeTable.h | 7 + 3 files changed, 132 insertions(+), 90 deletions(-)
New commits: commit d8f418d2f2ec5966d77caf128a52c834fdd0efcf Author: Khaled Hosny <[email protected]> Date: Mon Nov 23 13:52:10 2015 +0400 Fix finding Arabic Presentation Forms ligatures PDF text containing Arabic Presentation Forms ligatures is still not found after the previous commit. This is because the ligatures are decomposed in logical order after normalization, while the whole string is in visual order. For example, the RTL text ABCD in visual order will be DCBA, and assuming B is a ligature, it will be decomposed to B1B2 so the string after normalization will be DCB1B2A while we are expecting it to be DCB2B1A. This patch reverses the order of the decomposition of RTL characters to work around this issue. diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 31d303d..fff3f05 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -35,6 +35,7 @@ // Copyright (C) 2013 José Aliste <[email protected]> // Copyright (C) 2013 Thomas Freitag <[email protected]> // Copyright (C) 2013 Ed Catmur <[email protected]> +// Copyright (C) 2016 Khaled Hosny <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -3900,7 +3901,8 @@ GBool TextPage::findText(Unicode *s, int len, if (!line->normalized) line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, - &line->normalized_idx); + &line->normalized_idx, + true); // convert the line to uppercase m = line->normalized_len; if (!caseSensitive) { diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc index 721af9d..c9f8e2a 100644 --- a/poppler/UnicodeTypeTable.cc +++ b/poppler/UnicodeTypeTable.cc @@ -17,6 +17,7 @@ // Copyright (C) 2007 Jeff Muizelaar <[email protected]> // Copyright (C) 2008 Albert Astals Cid <[email protected]> // Copyright (C) 2012 Adrian Johnson <[email protected]> +// Copyright (C) 2016 Khaled Hosny <[email 
protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -1015,7 +1016,9 @@ Unicode unicodeToUpper(Unicode c) { // of characters written. @buf may be NULL, in which case the length of the // decomposition is returned but nothing is written. If @u is its own // decomposition, write @u into @buf and return 1. -static int decomp_compat(Unicode u, Unicode *buf) { +// If reverseRTL is true, then decompositions of RTL characters will be output +// in reverse order. +static int decomp_compat(Unicode u, Unicode *buf, GBool reverseRTL = false) { // decomposition tables stored as lists {character, decomp_length, offset} // so we do a binary search int start = 0, end = DECOMP_TABLE_LENGTH; @@ -1031,7 +1034,10 @@ static int decomp_compat(Unicode u, Unicode *buf) { int length = decomp_table[midpoint].length, i; if (buf) for (i = 0; i < length; ++i) - buf[i] = decomp_expansion[offset + i]; + if (unicodeTypeR(u) && reverseRTL) + buf[i] = decomp_expansion[offset + length - i - 1]; + else + buf[i] = decomp_expansion[offset + i]; return length; } } else if (midpoint == start) @@ -1125,8 +1131,14 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) { // for each character in the normalized string giving the index in @in of the // corresponding unnormalized character. @indices is not guaranteed monotone or // onto. 
-Unicode *unicodeNormalizeNFKC(Unicode *in, int len, +Unicode *unicodeNormalizeNFKC(Unicode *in, int len, int *out_len, int **indices) { + return unicodeNormalizeNFKC(in, len, out_len, indices, false); +} + +Unicode *unicodeNormalizeNFKC(Unicode *in, int len, + int *out_len, int **indices, + GBool reverseRTL) { Unicode *out; int i, o, *classes, *idx = NULL; @@ -1174,7 +1186,7 @@ Unicode *unicodeNormalizeNFKC(Unicode *in, int len, u = in[j]; if (j != i && COMBINING_CLASS(u) == 0) break; - dlen = decomp_compat(u, out + p); + dlen = decomp_compat(u, out + p, reverseRTL); for (q = p; q < p + dlen; ++q) { classes[q] = COMBINING_CLASS(out[q]); if (indices) diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h index 869aad9..978d889 100644 --- a/poppler/UnicodeTypeTable.h +++ b/poppler/UnicodeTypeTable.h @@ -15,6 +15,7 @@ // // Copyright (C) 2006 Ed Catmur <[email protected]> // Copyright (C) 2012 Adrian Johnson <[email protected]> +// Copyright (C) 2016 Khaled Hosny <[email protected]> // // To see a description of the changes please see the Changelog file that // came with your tarball or type make ChangeLog if you are building from git @@ -38,7 +39,11 @@ extern GBool unicodeIsAlphabeticPresentationForm(Unicode c); extern Unicode unicodeToUpper(Unicode c); -extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, +extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, int *out_len, int **offsets); +extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, + int *out_len, int **offsets, + GBool reverseRTL); + #endif commit 67645087477beb618304ea34cbdbafd40b199276 Author: Khaled Hosny <[email protected]> Date: Wed Nov 18 14:47:28 2015 +0400 Handle right-to-left text in search Currently right-to-left text reversal is only done during text dumping, but not during search. This commit applies the same reversal logic during PDF search as well. 
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index bbb371a..31d303d 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -178,6 +178,94 @@ #define combMaxMidDelta 0.3 #define combMaxBaseDelta 0.4 +static int reorderText(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GooString *s, Unicode* u) { + char lre[8], rle[8], popdf[8], buf[8]; + int lreLen = 0, rleLen = 0, popdfLen = 0, n; + int nCols, i, j, k; + + nCols = 0; + + if (s) { + lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); + rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); + popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); + } + + if (primaryLR) { + i = 0; + while (i < len) { + // output a left-to-right section + for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; + for (k = i; k < j; ++k) { + if (s) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + } + if (u) u[nCols] = text[k]; + ++nCols; + } + i = j; + // output a right-to-left section + for (j = i; + j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); + ++j) ; + if (j > i) { + if (s) s->append(rle, rleLen); + for (k = j - 1; k >= i; --k) { + if (s) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + } + if (u) u[nCols] = text[k]; + ++nCols; + } + if (s) s->append(popdf, popdfLen); + i = j; + } + } + } else { + // Note: This code treats numeric characters (European and + // Arabic/Indic) as left-to-right, which isn't strictly correct + // (incurs extra LRE/POPDF pairs), but does produce correct + // visual formatting. 
+ if (s) s->append(rle, rleLen); + i = len - 1; + while (i >= 0) { + // output a right-to-left section + for (j = i; + j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); + --j) ; + for (k = i; k > j; --k) { + if (s) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + } + if (u) u[nCols] = text[k]; + ++nCols; + } + i = j; + // output a left-to-right section + for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; + if (j < i) { + if (s) s->append(lre, lreLen); + for (k = j + 1; k <= i; ++k) { + if (s) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + } + if (u) u[nCols] = text[k]; + ++nCols; + } + if (s) s->append(popdf, popdfLen); + i = j; + } + } + if (s) s->append(popdf, popdfLen); + } + + return nCols; +} + //------------------------------------------------------------------------ // TextUnderline //------------------------------------------------------------------------ @@ -3720,7 +3808,7 @@ GBool TextPage::findText(Unicode *s, int len, double *xMax, double *yMax) { TextBlock *blk; TextLine *line; - Unicode *s2, *txt; + Unicode *s2, *txt, *reordered; Unicode *p; int txtSize, m, i, j, k; double xStart, yStart, xStop, yStop; @@ -3728,20 +3816,23 @@ GBool TextPage::findText(Unicode *s, int len, double xMin1, yMin1, xMax1, yMax1; GBool found; - //~ needs to handle right-to-left text if (rawOrder) { return gFalse; } + // handle right-to-left text + reordered = (Unicode*)gmallocn(len, sizeof(Unicode)); + reorderText(s, len, NULL, primaryLR, NULL, reordered); + + // normalize the search string + s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL); + // convert the search string to uppercase if (!caseSensitive) { - s2 = unicodeNormalizeNFKC(s, len, &len, NULL); for (i = 0; i < len; ++i) { s2[i] = unicodeToUpper(s2[i]); } - } else { - s2 = unicodeNormalizeNFKC(s, len, &len, NULL); } txt = NULL; @@ -3915,6 +4006,7 @@ GBool TextPage::findText(Unicode *s, int len, } gfree(s2); + gfree(reordered); if 
(!caseSensitive) { gfree(txt); } @@ -5330,91 +5422,22 @@ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) { int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s) { - char lre[8], rle[8], popdf[8], buf[8]; - int lreLen, rleLen, popdfLen, n; - int nCols, i, j, k; - - nCols = 0; - if (uMap->isUnicode()) { + return reorderText(text, len, uMap, primaryLR, s, NULL); + } else { + int nCols = 0; - lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); - rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); - popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); - - if (primaryLR) { - - i = 0; - while (i < len) { - // output a left-to-right section - for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; - for (k = i; k < j; ++k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - i = j; - // output a right-to-left section - for (j = i; - j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); - ++j) ; - if (j > i) { - s->append(rle, rleLen); - for (k = j - 1; k >= i; --k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - s->append(popdf, popdfLen); - i = j; - } - } - - } else { - - // Note: This code treats numeric characters (European and - // Arabic/Indic) as left-to-right, which isn't strictly correct - // (incurs extra LRE/POPDF pairs), but does produce correct - // visual formatting. 
- s->append(rle, rleLen); - i = len - 1; - while (i >= 0) { - // output a right-to-left section - for (j = i; - j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); - --j) ; - for (k = i; k > j; --k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - i = j; - // output a left-to-right section - for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; - if (j < i) { - s->append(lre, lreLen); - for (k = j + 1; k <= i; ++k) { - n = uMap->mapUnicode(text[k], buf, sizeof(buf)); - s->append(buf, n); - ++nCols; - } - s->append(popdf, popdfLen); - i = j; - } - } - s->append(popdf, popdfLen); + char buf[8]; + int buflen = 0; + for (int i = 0; i < len; ++i) { + buflen = uMap->mapUnicode(text[i], buf, sizeof(buf)); + s->append(buf, buflen); + nCols += buflen; } - } else { - for (i = 0; i < len; ++i) { - n = uMap->mapUnicode(text[i], buf, sizeof(buf)); - s->append(buf, n); - nCols += n; - } + return nCols; } - - return nCols; } #if TEXTOUT_WORD_LIST
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
