Re: [poppler] [PATCH] Fixup LaTeX composed characters

Tim Brody Mon, 28 Mar 2011 02:56:50 -0700

On Fri, 2011-03-25 at 20:43 +0000, Albert Astals Cid wrote:
> A Divendres, 25 de març de 2011, vàreu escriure:
> > On Fri, 25 Mar 2011 19:02:46 +0000, Albert Astals Cid <[email protected]>
> > 
> > wrote:
> > > A Divendres, 25 de març de 2011, Tim Brody va escriure:
> > >> Hi All,
> > >> 
> > >> Attached is a patch to address the previous problem I wrote about with
> > >> pdflatex-produced PDFs that contain overlapping-diacritics/accents.


> > > Hmmm, is it supposed to just kill the diacritic mark?
> > > 
> > > R. L¨wen and B. Polster
> > > o
> > > gets converted to
> > > R. Lowen and B. Polster
> > > shouldn't it be
> > > R. Löwen and B. Polster
> > > ?
> > 
> > It should do - can you send me this PDF?
> 
> http://www.maths.mq.edu.au/~ross/5019-e-cmap.pdf

This PDF has [combining character][character to combine with].

I've added combining-chars to the equiv-mapping table which appears to
work for this PDF:

"R. Löwen and B. Polster"
"Institut für Analysis und Algebra ..."

> > 
> > I get this from TeX:
> > R. L\"owen and B. Polster => R. Löwen and B. Polster
> > 
> > NB I just tried extracting from a Word-generated PDF and TextOutputDev
> > didn't see the line with the diacritic at all.
> 
> And are you sure it's not a Word fault?

Oh I expect it is but I thought I'd mention it. I'll try to investigate
further.

/Tim.

From 125b8360af90183a690357dc6ce895967766ef47 Mon Sep 17 00:00:00 2001
From: Tim Brody <[email protected]>
Date: Fri, 25 Mar 2011 13:02:18 +0000
Subject: [PATCH] Turn TeX-style composed characters into Unicode combining characters during text conversion.

---
 poppler/TextOutputDev.cc         |   38 +++++++++++--
 poppler/UnicodeCompEquivTables.h |  106 ++++++++++++++++++++++++++++++++++++++
 poppler/UnicodeTypeTable.cc      |   23 ++++++++
 poppler/UnicodeTypeTable.h       |    2 +
 4 files changed, 163 insertions(+), 6 deletions(-)
 create mode 100644 poppler/UnicodeCompEquivTables.h

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 13c67c6..2191fd1 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -2161,6 +2161,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
   double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
   GBool overlap;
   int i;
+  Unicode *uc = NULL; // u + combining character
 
   // subtract char and word spacing from the dx,dy values
   sp = state->getCharSpace();
@@ -2236,12 +2237,34 @@ void TextPage::addChar(GfxState *state, double x, double y,
     }
     overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
               fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
-    if (overlap || lastCharOverlap ||
-	sp < -minDupBreakOverlap * curWord->fontSize ||
-	sp > minWordBreakSpace * curWord->fontSize ||
-	fabs(base - curWord->base) > 0.5 ||
-	curFontSize != curWord->fontSize) {
-      endWord();
+    if (
+        // place overlapping characters in their own word
+        lastCharOverlap ||
+        // whitespace along main axis
+        sp > minWordBreakSpace * curWord->fontSize ||
+        // whitespace along secondary-axis
+        fabs(base - curWord->base) > 0.5 ||
+        // font size changed
+        curFontSize != curWord->fontSize
+       ) {
+        endWord();
+    }
+    // overlapping characters
+    else if (overlap || sp < -minDupBreakOverlap * curWord->fontSize ) {
+      // "u => ü, as seen in pdflatex output
+      Unicode uu;
+      if (unicodeCombineEquiv (curWord->text[curWord->len - 1], &uu)) {
+          curWord->len--;
+          curWord->charLen--;
+          uc = (Unicode *) gmallocn (uLen+1, sizeof (Unicode));
+          memcpy (uc, u, uLen * sizeof (Unicode));
+          uc[uLen++] = uu;
+          u = uc;
+          overlap = gFalse;
+      }
+      else {
+        endWord();
+      }
     }
     lastCharOverlap = overlap;
   } else {
@@ -2293,6 +2316,9 @@ void TextPage::addChar(GfxState *state, double x, double y,
       }
     }
   }
+  if (uc) {
+      gfree (uc);
+  }
   if (curWord) {
     curWord->charLen += nBytes;
   }
diff --git a/poppler/UnicodeCompEquivTables.h b/poppler/UnicodeCompEquivTables.h
new file mode 100644
index 0000000..0f1dfc1
--- /dev/null
+++ b/poppler/UnicodeCompEquivTables.h
@@ -0,0 +1,106 @@
+// Generated by combining.pl at Mon Mar 28 09:34:05 2011
+
+typedef struct {
+	Unicode character; // code point looked up by unicodeCombineEquiv() (table sort key)
+	Unicode combining; // combining code point reported for `character`
+} combine_equiv;
+
+#define COMBINE_EQUIV_TABLE_LENGTH 95
+
+static const combine_equiv combine_equiv_table[] = { // 95 rows, ascending by .character (binary-searched)
+    { 0x0022, 0x030e },
+    { 0x0027, 0x0301 },
+    { 0x005e, 0x0302 },
+    { 0x005f, 0x0332 },
+    { 0x0060, 0x0300 },
+    { 0x007e, 0x0303 },
+    { 0x00a8, 0x0308 },
+    { 0x00af, 0x0305 },
+    { 0x00b0, 0x030a },
+    { 0x00b4, 0x0301 },
+    { 0x00b8, 0x0327 },
+    { 0x02b1, 0x0324 },
+    { 0x02b2, 0x0321 },
+    { 0x02b7, 0x032b },
+    { 0x02b9, 0x0301 },
+    { 0x02ba, 0x030b },
+    { 0x02bb, 0x0312 },
+    { 0x02bc, 0x0315 },
+    { 0x02bd, 0x0314 },
+    { 0x02c0, 0x0309 },
+    { 0x02c6, 0x0302 },
+    { 0x02c7, 0x030c },
+    { 0x02c8, 0x030d },
+    { 0x02c9, 0x0304 },
+    { 0x02ca, 0x0301 },
+    { 0x02cb, 0x0300 },
+    { 0x02cc, 0x0329 },
+    { 0x02cd, 0x0331 },
+    { 0x02d4, 0x0323 },
+    { 0x02d5, 0x031e },
+    { 0x02d6, 0x031f },
+    { 0x02d7, 0x0320 },
+    { 0x02d8, 0x0306 },
+    { 0x02d9, 0x0307 },
+    { 0x02da, 0x030a },
+    { 0x02db, 0x0328 },
+    { 0x02dc, 0x0303 },
+    { 0x02dd, 0x030b },
+    { 0x0300, 0x0300 },
+    { 0x0301, 0x0301 },
+    { 0x0302, 0x0302 },
+    { 0x0303, 0x0303 },
+    { 0x0304, 0x0304 },
+    { 0x0305, 0x0305 },
+    { 0x0306, 0x0306 },
+    { 0x0307, 0x0307 },
+    { 0x0308, 0x0308 },
+    { 0x0309, 0x0309 },
+    { 0x030a, 0x030a },
+    { 0x030b, 0x030b },
+    { 0x030c, 0x030c },
+    { 0x030d, 0x030d },
+    { 0x030e, 0x030e },
+    { 0x030f, 0x030f },
+    { 0x0310, 0x0310 },
+    { 0x0311, 0x0311 },
+    { 0x0312, 0x0312 },
+    { 0x0313, 0x0313 },
+    { 0x0314, 0x0314 },
+    { 0x0315, 0x0315 },
+    { 0x0316, 0x0316 },
+    { 0x0317, 0x0317 },
+    { 0x0318, 0x0318 },
+    { 0x0319, 0x0319 },
+    { 0x031a, 0x031a },
+    { 0x031b, 0x031b },
+    { 0x031c, 0x031c },
+    { 0x031d, 0x031d },
+    { 0x031e, 0x031e },
+    { 0x031f, 0x031f },
+    { 0x0320, 0x0320 },
+    { 0x0321, 0x0321 },
+    { 0x0322, 0x0322 },
+    { 0x0323, 0x0323 },
+    { 0x0324, 0x0324 },
+    { 0x0325, 0x0325 },
+    { 0x0326, 0x0326 },
+    { 0x0327, 0x0327 },
+    { 0x0328, 0x0328 },
+    { 0x0329, 0x0329 },
+    { 0x032a, 0x032a },
+    { 0x032b, 0x032b },
+    { 0x032c, 0x032c },
+    { 0x032d, 0x032d },
+    { 0x032e, 0x032e },
+    { 0x032f, 0x032f },
+    { 0x0330, 0x0330 },
+    { 0x0331, 0x0331 },
+    { 0x0332, 0x0332 },
+    { 0x0333, 0x0333 },
+    { 0x0384, 0x0301 },
+    { 0x0559, 0x0314 },
+    { 0x055a, 0x0313 },
+    { 0x0901, 0x0310 },
+    { 0x2017, 0x0333 },
+};
diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc
index c0483a5..ab40f86 100644
--- a/poppler/UnicodeTypeTable.cc
+++ b/poppler/UnicodeTypeTable.cc
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include "CharTypes.h"
 #include "UnicodeTypeTable.h"
+#include "UnicodeCompEquivTables.h"
 #include "goo/gmem.h"
 
 struct UnicodeMapTableEntry {
@@ -1095,6 +1096,28 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) {
       (((v) - HANGUL_V_BASE) + (HANGUL_V_COUNT * ((l) - HANGUL_L_BASE)))))
 #define HANGUL_COMPOSE_LV_T(lv, t) ((lv) + ((t) - HANGUL_T_BASE))
 
+// Binary-searches combine_equiv_table for @in. On a match, stores the combining
+// equivalent in @out and returns gTrue; otherwise returns gFalse (@out untouched).
+GBool unicodeCombineEquiv(Unicode in, Unicode *out) {
+    int start = 0, end = COMBINE_EQUIV_TABLE_LENGTH; // search window [start, end)
+
+    while (gTrue) {
+        int midpoint = (start+end) / 2;
+        if (combine_equiv_table[midpoint].character == in) {
+            *out = combine_equiv_table[midpoint].combining;
+            return gTrue;
+        }
+        else if (start == midpoint) // window is down to one entry and it did not match
+            break;
+        else if (in > combine_equiv_table[midpoint].character)
+            start = midpoint; // midpoint was rejected above but stays as the lower bound
+        else
+            end = midpoint; // end is exclusive, so midpoint drops out of the window
+    }
+
+    return gFalse;
+}
+
 // Converts Unicode string @in of length @len to its normalization in form 
 // NFKC (compatibility decomposition + canonical composition). The length of
 // the resulting Unicode string is returned in @out_len. If non-NULL, @indices
diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h
index 939e916..cabe80e 100644
--- a/poppler/UnicodeTypeTable.h
+++ b/poppler/UnicodeTypeTable.h
@@ -28,6 +28,8 @@ extern GBool unicodeTypeR(Unicode c);
 
 extern Unicode unicodeToUpper(Unicode c);
 
+extern GBool unicodeCombineEquiv(Unicode in, Unicode *out);
+
 extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, 
 				     int *out_len, int **offsets);
 
-- 
1.7.2.3

_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Re: [poppler] [PATCH] Fixup LaTeX composed characters

Reply via email to