UnicodeTypeTable.h

Albert Astals Cid Sun, 03 Jan 2016 04:00:07 -0800

 poppler/TextOutputDev.cc    |  195 ++++++++++++++++++++++++--------------------
 poppler/UnicodeTypeTable.cc |   20 +++-
 poppler/UnicodeTypeTable.h  |    7 +
 3 files changed, 132 insertions(+), 90 deletions(-)


New commits:
commit d8f418d2f2ec5966d77caf128a52c834fdd0efcf
Author: Khaled Hosny <[email protected]>
Date:   Mon Nov 23 13:52:10 2015 +0400

    Fix finding Arabic Presentation Forms ligatures
    
    PDF text containing Arabic Presentation forms ligatures is still not
    found after the previous commit.
    
    This is because the ligatures are decomposed in logical order after
    normalisation, while the whole string is in visual order.  For example
    the RTL text ABCD in visual order will be DCBA, and assuming B is a
    ligature, it will be decomposed to B1B2 so the string after
    normalization will be DCB1B2A while we are expecting it to be DCB2B1A.
    
    This patch reverses the order of the decomposition of RTL characters to
    work around this issue.

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 31d303d..fff3f05 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -35,6 +35,7 @@
 // Copyright (C) 2013 José Aliste <[email protected]>
 // Copyright (C) 2013 Thomas Freitag <[email protected]>
 // Copyright (C) 2013 Ed Catmur <[email protected]>
+// Copyright (C) 2016 Khaled Hosny <[email protected]>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -3900,7 +3901,8 @@ GBool TextPage::findText(Unicode *s, int len,
       if (!line->normalized)
        line->normalized = unicodeNormalizeNFKC(line->text, line->len, 
                                                &line->normalized_len, 
-                                               &line->normalized_idx);
+                                               &line->normalized_idx,
+                                               true);
       // convert the line to uppercase
       m = line->normalized_len;
       if (!caseSensitive) {
diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc
index 721af9d..c9f8e2a 100644
--- a/poppler/UnicodeTypeTable.cc
+++ b/poppler/UnicodeTypeTable.cc
@@ -17,6 +17,7 @@
 // Copyright (C) 2007 Jeff Muizelaar <[email protected]>
 // Copyright (C) 2008 Albert Astals Cid <[email protected]>
 // Copyright (C) 2012 Adrian Johnson <[email protected]>
+// Copyright (C) 2016 Khaled Hosny <[email protected]>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -1015,7 +1016,9 @@ Unicode unicodeToUpper(Unicode c) {
 // of characters written. @buf may be NULL, in which case the length of the
 // decomposition is returned but nothing is written. If @u is its own
 // decomposition, write @u into @buf and return 1.
-static int decomp_compat(Unicode u, Unicode *buf) {
+// If reverseRTL is true, then decompositions of RTL characters will be output
+// in reverse order.
+static int decomp_compat(Unicode u, Unicode *buf, GBool reverseRTL = false) {
   // decomposition tables stored as lists {character, decomp_length, offset}
   // so we do a binary search
   int start = 0, end = DECOMP_TABLE_LENGTH;
@@ -1031,7 +1034,10 @@ static int decomp_compat(Unicode u, Unicode *buf) {
          int length = decomp_table[midpoint].length, i;
          if (buf)
            for (i = 0; i < length; ++i)
-             buf[i] = decomp_expansion[offset + i];
+               if (unicodeTypeR(u) && reverseRTL)
+                 buf[i] = decomp_expansion[offset + length - i - 1];
+               else
+                 buf[i] = decomp_expansion[offset + i];
          return length;
        }
       } else if (midpoint == start)
@@ -1125,8 +1131,14 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) {
 // for each character in the normalized string giving the index in @in of the 
 // corresponding unnormalized character. @indices is not guaranteed monotone or
 // onto.
-Unicode *unicodeNormalizeNFKC(Unicode *in, int len, 
+Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
                              int *out_len, int **indices) {
+    return unicodeNormalizeNFKC(in, len, out_len, indices, false);
+}
+
+Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
+                             int *out_len, int **indices,
+                             GBool reverseRTL) {
   Unicode *out;
   int i, o, *classes, *idx = NULL;
 
@@ -1174,7 +1186,7 @@ Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
        u = in[j];
        if (j != i && COMBINING_CLASS(u) == 0)
          break;
-       dlen = decomp_compat(u, out + p);
+       dlen = decomp_compat(u, out + p, reverseRTL);
        for (q = p; q < p + dlen; ++q) {
          classes[q] = COMBINING_CLASS(out[q]);
          if (indices)
diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h
index 869aad9..978d889 100644
--- a/poppler/UnicodeTypeTable.h
+++ b/poppler/UnicodeTypeTable.h
@@ -15,6 +15,7 @@
 //
 // Copyright (C) 2006 Ed Catmur <[email protected]>
 // Copyright (C) 2012 Adrian Johnson <[email protected]>
+// Copyright (C) 2016 Khaled Hosny <[email protected]>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -38,7 +39,11 @@ extern GBool unicodeIsAlphabeticPresentationForm(Unicode c);
 
 extern Unicode unicodeToUpper(Unicode c);
 
-extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, 
+extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
                                     int *out_len, int **offsets);
 
+extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len,
+                                    int *out_len, int **offsets,
+                                    GBool reverseRTL);
+
 #endif
commit 67645087477beb618304ea34cbdbafd40b199276
Author: Khaled Hosny <[email protected]>
Date:   Wed Nov 18 14:47:28 2015 +0400

    Handle right-to-left text in search
    
    Currently right-to-left text reversal is only done during text dumping,
    but not during search. This commit applies the same reversal logic
    during PDF search as well.

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index bbb371a..31d303d 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -178,6 +178,94 @@
 #define combMaxMidDelta 0.3
 #define combMaxBaseDelta 0.4
 
+static int reorderText(Unicode *text, int len, UnicodeMap *uMap, GBool primaryLR, GooString *s, Unicode* u) {
+  char lre[8], rle[8], popdf[8], buf[8];
+  int lreLen = 0, rleLen = 0, popdfLen = 0, n;
+  int nCols, i, j, k;
+
+  nCols = 0;
+
+  if (s) {
+    lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
+    rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
+    popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
+  }
+
+  if (primaryLR) {
+    i = 0;
+    while (i < len) {
+      // output a left-to-right section
+      for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
+      for (k = i; k < j; ++k) {
+        if (s) {
+          n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+          s->append(buf, n);
+        }
+        if (u) u[nCols] = text[k];
+        ++nCols;
+      }
+      i = j;
+      // output a right-to-left section
+      for (j = i;
+         j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+         ++j) ;
+      if (j > i) {
+        if (s) s->append(rle, rleLen);
+        for (k = j - 1; k >= i; --k) {
+          if (s) {
+            n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+            s->append(buf, n);
+          }
+          if (u) u[nCols] = text[k];
+          ++nCols;
+        }
+        if (s) s->append(popdf, popdfLen);
+        i = j;
+      }
+    }
+  } else {
+    // Note: This code treats numeric characters (European and
+    // Arabic/Indic) as left-to-right, which isn't strictly correct
+    // (incurs extra LRE/POPDF pairs), but does produce correct
+    // visual formatting.
+    if (s) s->append(rle, rleLen);
+    i = len - 1;
+    while (i >= 0) {
+      // output a right-to-left section
+      for (j = i;
+         j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
+         --j) ;
+      for (k = i; k > j; --k) {
+        if (s) {
+          n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+          s->append(buf, n);
+        }
+        if (u) u[nCols] = text[k];
+        ++nCols;
+      }
+      i = j;
+      // output a left-to-right section
+      for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
+      if (j < i) {
+        if (s) s->append(lre, lreLen);
+        for (k = j + 1; k <= i; ++k) {
+          if (s) {
+            n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+            s->append(buf, n);
+          }
+          if (u) u[nCols] = text[k];
+          ++nCols;
+        }
+        if (s) s->append(popdf, popdfLen);
+        i = j;
+      }
+    }
+    if (s) s->append(popdf, popdfLen);
+  }
+
+  return nCols;
+}
+
 //------------------------------------------------------------------------
 // TextUnderline
 //------------------------------------------------------------------------
@@ -3720,7 +3808,7 @@ GBool TextPage::findText(Unicode *s, int len,
                         double *xMax, double *yMax) {
   TextBlock *blk;
   TextLine *line;
-  Unicode *s2, *txt;
+  Unicode *s2, *txt, *reordered;
   Unicode *p;
   int txtSize, m, i, j, k;
   double xStart, yStart, xStop, yStop;
@@ -3728,20 +3816,23 @@ GBool TextPage::findText(Unicode *s, int len,
   double xMin1, yMin1, xMax1, yMax1;
   GBool found;
 
-  //~ needs to handle right-to-left text
 
   if (rawOrder) {
     return gFalse;
   }
 
+  // handle right-to-left text
+  reordered = (Unicode*)gmallocn(len, sizeof(Unicode));
+  reorderText(s, len, NULL, primaryLR, NULL, reordered);
+
+  // normalize the search string
+  s2 = unicodeNormalizeNFKC(reordered, len, &len, NULL);
+
   // convert the search string to uppercase
   if (!caseSensitive) {
-    s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
     for (i = 0; i < len; ++i) {
       s2[i] = unicodeToUpper(s2[i]);
     }
-  } else {
-    s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
   }
 
   txt = NULL;
@@ -3915,6 +4006,7 @@ GBool TextPage::findText(Unicode *s, int len,
   }
 
   gfree(s2);
+  gfree(reordered);
   if (!caseSensitive) {
     gfree(txt);
   }
@@ -5330,91 +5422,22 @@ void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
 
 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
                           GooString *s) {
-  char lre[8], rle[8], popdf[8], buf[8];
-  int lreLen, rleLen, popdfLen, n;
-  int nCols, i, j, k;
-
-  nCols = 0;
-
   if (uMap->isUnicode()) {
+    return reorderText(text, len, uMap, primaryLR, s, NULL);
+  } else {
+    int nCols = 0;
 
-    lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
-    rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
-    popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
-
-    if (primaryLR) {
-
-      i = 0;
-      while (i < len) {
-       // output a left-to-right section
-       for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
-       for (k = i; k < j; ++k) {
-         n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-         s->append(buf, n);
-         ++nCols;
-       }
-       i = j;
-       // output a right-to-left section
-       for (j = i;
-            j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
-            ++j) ;
-       if (j > i) {
-         s->append(rle, rleLen);
-         for (k = j - 1; k >= i; --k) {
-           n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-           s->append(buf, n);
-           ++nCols;
-         }
-         s->append(popdf, popdfLen);
-         i = j;
-       }
-      }
-
-    } else {
-
-      // Note: This code treats numeric characters (European and
-      // Arabic/Indic) as left-to-right, which isn't strictly correct
-      // (incurs extra LRE/POPDF pairs), but does produce correct
-      // visual formatting.
-      s->append(rle, rleLen);
-      i = len - 1;
-      while (i >= 0) {
-       // output a right-to-left section
-       for (j = i;
-            j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
-            --j) ;
-       for (k = i; k > j; --k) {
-         n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-         s->append(buf, n);
-         ++nCols;
-       }
-       i = j;
-       // output a left-to-right section
-       for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
-       if (j < i) {
-         s->append(lre, lreLen);
-         for (k = j + 1; k <= i; ++k) {
-           n = uMap->mapUnicode(text[k], buf, sizeof(buf));
-           s->append(buf, n);
-           ++nCols;
-         }
-         s->append(popdf, popdfLen);
-         i = j;
-       }
-      }
-      s->append(popdf, popdfLen);
+    char buf[8];
+    int buflen = 0;
 
+    for (int i = 0; i < len; ++i) {
+      buflen = uMap->mapUnicode(text[i], buf, sizeof(buf));
+      s->append(buf, buflen);
+      nCols += buflen;
     }
 
-  } else {
-    for (i = 0; i < len; ++i) {
-      n = uMap->mapUnicode(text[i], buf, sizeof(buf));
-      s->append(buf, n);
-      nCols += n;
-    }
+    return nCols;
   }
-
-  return nCols;
 }
 
 #if TEXTOUT_WORD_LIST

_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

[poppler] 2 commits - poppler/TextOutputDev.cc poppler/UnicodeTypeTable.cc poppler/UnicodeTypeTable.h

Reply via email to