On Fri, 2011-03-25 at 20:43 +0000, Albert Astals Cid wrote: > A Divendres, 25 de març de 2011, vàreu escriure: > > On Fri, 25 Mar 2011 19:02:46 +0000, Albert Astals Cid <[email protected]> > > > > wrote: > > > A Divendres, 25 de març de 2011, Tim Brody va escriure: > > >> Hi All, > > >> > > >> Attached is a patch to address the previous problem I wrote about with > > >> pdflatex-produced PDFs that contain overlapping-diacritics/accents.
> > > Hmmm, is it supposed to just kill the diacritic mark? > > > > > > R. L¨wen and B. Polster > > > o > > > gets converted to > > > R. Lowen and B. Polster > > > shouldn't it be > > > R. Löwen and B. Polster > > > ? > > > > It should do - can you send me this PDF? > > http://www.maths.mq.edu.au/~ross/5019-e-cmap.pdf This PDF has [combining character][character to combine with]. I've added combining-chars to the equiv-mapping table which appears to work for this PDF: "R. Löwen and B. Polster" "Institut für Analysis und Algebra ..." > > > > I get this from TeX: > > R. L\"owen and B. Polster => R. Löwen and B. Polster > > > > NB I just tried extracting from a Word-generated PDF and TextOutputDev > > didn't see the line with the diacritic at all. > > And are you sure it's not a Word fault? Oh I expect it is but I thought I'd mention it. I'll try to investigate further. /Tim.
>From 125b8360af90183a690357dc6ce895967766ef47 Mon Sep 17 00:00:00 2001 From: Tim Brody <[email protected]> Date: Fri, 25 Mar 2011 13:02:18 +0000 Subject: [PATCH] Turn TeX-style composed characters into Unicode combining characters during text conversion. --- poppler/TextOutputDev.cc | 38 +++++++++++-- poppler/UnicodeCompEquivTables.h | 106 ++++++++++++++++++++++++++++++++++++++ poppler/UnicodeTypeTable.cc | 23 ++++++++ poppler/UnicodeTypeTable.h | 2 + 4 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 poppler/UnicodeCompEquivTables.h diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 13c67c6..2191fd1 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -2161,6 +2161,7 @@ void TextPage::addChar(GfxState *state, double x, double y, double x1, y1, w1, h1, dx2, dy2, base, sp, delta; GBool overlap; int i; + Unicode *uc = NULL; // u + combining character // subtract char and word spacing from the dx,dy values sp = state->getCharSpace(); @@ -2236,12 +2237,34 @@ void TextPage::addChar(GfxState *state, double x, double y, } overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize; - if (overlap || lastCharOverlap || - sp < -minDupBreakOverlap * curWord->fontSize || - sp > minWordBreakSpace * curWord->fontSize || - fabs(base - curWord->base) > 0.5 || - curFontSize != curWord->fontSize) { - endWord(); + if ( + // place overlapping characters in their own word + lastCharOverlap || + // whitespace along main axis + sp > minWordBreakSpace * curWord->fontSize || + // whitespace along secondary-axis + fabs(base - curWord->base) > 0.5 || + // font size changed + curFontSize != curWord->fontSize + ) { + endWord(); + } + // overlapping characters + else if (overlap || sp < -minDupBreakOverlap * curWord->fontSize ) { + // "u => ü, as seen in pdflatex output + Unicode uu; + if (unicodeCombineEquiv (curWord->text[curWord->len - 1], &uu)) { + curWord->len--; + curWord->charLen--; + uc = (Unicode *) gmallocn (uLen+1, sizeof (Unicode)); + memcpy (uc, u, uLen * sizeof (Unicode)); + uc[uLen++] = uu; + u = uc; + overlap = gFalse; + } + else { + endWord(); + } } lastCharOverlap = overlap; } else { @@ -2293,6 +2316,9 @@ void TextPage::addChar(GfxState *state, double x, double y, } } } + if (uc) { + gfree (uc); + } if (curWord) { curWord->charLen += nBytes; } diff --git a/poppler/UnicodeCompEquivTables.h b/poppler/UnicodeCompEquivTables.h new file mode 100644 index 0000000..0f1dfc1 --- /dev/null +++ b/poppler/UnicodeCompEquivTables.h @@ -0,0 +1,106 @@ +// Generated by combining.pl at Mon Mar 28 09:34:05 2011 + +typedef struct { + Unicode character; + Unicode combining; +} combine_equiv; + +#define COMBINE_EQUIV_TABLE_LENGTH 95 + +static const combine_equiv combine_equiv_table[] = { + { 0x0022, 0x030e }, + { 0x0027, 0x0301 }, + { 0x005e, 0x0302 }, + { 0x005f, 0x0332 }, + { 0x0060, 0x0300 }, + { 0x007e, 0x0303 }, + { 0x00a8, 0x0308 }, + { 0x00af, 0x0305 }, + { 0x00b0, 0x030a }, + { 0x00b4, 0x0301 }, + { 0x00b8, 0x0327 }, + { 0x02b1, 0x0324 }, + { 0x02b2, 0x0321 }, + { 0x02b7, 0x032b }, + { 0x02b9, 0x0301 }, + { 0x02ba, 0x030b }, + { 0x02bb, 0x0312 }, + { 0x02bc, 0x0315 }, + { 0x02bd, 0x0314 }, + { 0x02c0, 0x0309 }, + { 0x02c6, 0x0302 }, + { 0x02c7, 0x030c }, + { 0x02c8, 0x030d }, + { 0x02c9, 0x0304 }, + { 0x02ca, 0x0301 }, + { 0x02cb, 0x0300 }, + { 0x02cc, 0x0329 }, + { 0x02cd, 0x0331 }, + { 0x02d4, 0x0323 }, + { 0x02d5, 0x031e }, + { 0x02d6, 0x031f }, + { 0x02d7, 0x0320 }, + { 0x02d8, 0x0306 }, + { 0x02d9, 0x0307 }, + { 0x02da, 0x030a }, + { 0x02db, 0x0328 }, + { 0x02dc, 0x0303 }, + { 0x02dd, 0x030b }, + { 0x0300, 0x0300 }, + { 0x0301, 0x0301 }, + { 0x0302, 0x0302 }, + { 0x0303, 0x0303 }, + { 0x0304, 0x0304 }, + { 0x0305, 0x0305 }, + { 0x0306, 0x0306 }, + { 0x0307, 0x0307 }, + { 0x0308, 0x0308 }, + { 0x0309, 0x0309 }, + { 0x030a, 0x030a }, + { 0x030b, 0x030b }, + { 0x030c, 0x030c }, + { 0x030d, 0x030d }, + { 0x030e, 0x030e }, + { 0x030f, 0x030f }, + { 0x0310, 0x0310 }, + { 0x0311, 0x0311 }, + { 0x0312, 0x0312 }, + { 0x0313, 0x0313 }, + { 0x0314, 0x0314 }, + { 0x0315, 0x0315 }, + { 0x0316, 0x0316 }, + { 0x0317, 0x0317 }, + { 0x0318, 0x0318 }, + { 0x0319, 0x0319 }, + { 0x031a, 0x031a }, + { 0x031b, 0x031b }, + { 0x031c, 0x031c }, + { 0x031d, 0x031d }, + { 0x031e, 0x031e }, + { 0x031f, 0x031f }, + { 0x0320, 0x0320 }, + { 0x0321, 0x0321 }, + { 0x0322, 0x0322 }, + { 0x0323, 0x0323 }, + { 0x0324, 0x0324 }, + { 0x0325, 0x0325 }, + { 0x0326, 0x0326 }, + { 0x0327, 0x0327 }, + { 0x0328, 0x0328 }, + { 0x0329, 0x0329 }, + { 0x032a, 0x032a }, + { 0x032b, 0x032b }, + { 0x032c, 0x032c }, + { 0x032d, 0x032d }, + { 0x032e, 0x032e }, + { 0x032f, 0x032f }, + { 0x0330, 0x0330 }, + { 0x0331, 0x0331 }, + { 0x0332, 0x0332 }, + { 0x0333, 0x0333 }, + { 0x0384, 0x0301 }, + { 0x0559, 0x0314 }, + { 0x055a, 0x0313 }, + { 0x0901, 0x0310 }, + { 0x2017, 0x0333 }, +}; diff --git a/poppler/UnicodeTypeTable.cc b/poppler/UnicodeTypeTable.cc index c0483a5..ab40f86 100644 --- a/poppler/UnicodeTypeTable.cc +++ b/poppler/UnicodeTypeTable.cc @@ -22,6 +22,7 @@ #include <stdlib.h> #include "CharTypes.h" #include "UnicodeTypeTable.h" +#include "UnicodeCompEquivTables.h" #include "goo/gmem.h" struct UnicodeMapTableEntry { @@ -1095,6 +1096,28 @@ static GBool combine(Unicode base, Unicode add, Unicode *out) { (((v) - HANGUL_V_BASE) + (HANGUL_V_COUNT * ((l) - HANGUL_L_BASE))))) #define HANGUL_COMPOSE_LV_T(lv, t) ((lv) + ((t) - HANGUL_T_BASE)) +// Returns gTrue if @in has a combining equivalent (placed in @out), otherwise +// gFalse. +GBool unicodeCombineEquiv(Unicode in, Unicode *out) { + int start = 0, end = COMBINE_EQUIV_TABLE_LENGTH; + + while (gTrue) { + int midpoint = (start+end) / 2; + if (combine_equiv_table[midpoint].character == in) { + *out = combine_equiv_table[midpoint].combining; + return gTrue; + } + else if (start == midpoint) + break; + else if (in > combine_equiv_table[midpoint].character) + start = midpoint; + else + end = midpoint; + } + + return gFalse; +} + // Converts Unicode string @in of length @len to its normalization in form // NFKC (compatibility decomposition + canonical composition). The length of // the resulting Unicode string is returned in @out_len. If non-NULL, @indices diff --git a/poppler/UnicodeTypeTable.h b/poppler/UnicodeTypeTable.h index 939e916..cabe80e 100644 --- a/poppler/UnicodeTypeTable.h +++ b/poppler/UnicodeTypeTable.h @@ -28,6 +28,8 @@ extern GBool unicodeTypeR(Unicode c); extern Unicode unicodeToUpper(Unicode c); +extern GBool unicodeCombineEquiv(Unicode in, Unicode *out); + extern Unicode *unicodeNormalizeNFKC(Unicode *in, int len, int *out_len, int **offsets); -- 1.7.2.3
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
