poppler/TextOutputDev.cc | 268 +++++++++++++++++++++++++++++------------------ poppler/TextOutputDev.h | 19 +-- 2 files changed, 180 insertions(+), 107 deletions(-)
New commits: commit c5ce12993a4d2bcd3b3e95b1f08d00dc8960678c Author: Albert Astals Cid <[email protected]> Date: Wed Dec 14 22:49:33 2011 +0100 [xpdf303] Merge some stuff from TextOutputDev Yes, this is the best commit log i could think of diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 570d53d..0c451ec 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -234,74 +234,103 @@ GBool TextFontInfo::matches(TextFontInfo *fontInfo) { //------------------------------------------------------------------------ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, - int charPosA, TextFontInfo *fontA, double fontSizeA) { + TextFontInfo *fontA, double fontSizeA) { GfxFont *gfxFont; double x, y, ascent, descent; - + int wMode; + rot = rotA; - charPos = charPosA; - charLen = 0; font = fontA; fontSize = fontSizeA; state->transform(x0, y0, &x, &y); if ((gfxFont = font->gfxFont)) { ascent = gfxFont->getAscent() * fontSize; descent = gfxFont->getDescent() * fontSize; + wMode = gfxFont->getWMode(); } else { // this means that the PDF file draws text without a current font, // which should never happen ascent = 0.95 * fontSize; descent = -0.35 * fontSize; + wMode = 0; } - switch (rot) { - case 0: - yMin = y - ascent; - yMax = y - descent; - if (yMin == yMax) { - // this is a sanity check for a case that shouldn't happen -- but - // if it does happen, we want to avoid dividing by zero later - yMin = y; - yMax = y + 1; - } - base = y; - break; - case 1: - xMin = x + descent; - xMax = x + ascent; - if (xMin == xMax) { - // this is a sanity check for a case that shouldn't happen -- but - // if it does happen, we want to avoid dividing by zero later + if (wMode) { // vertical writing mode + // NB: the rotation value has been incremented by 1 (in + // TextPage::beginWord()) for vertical writing mode + switch (rot) { + case 0: + yMin = y - fontSize; + yMax = y; + base = y; + break; + case 1: xMin = x; - xMax = x + 1; - } - base = x; - break; - case 2: - yMin = y + descent; - yMax = y + ascent; - if (yMin == yMax) { - // this is a sanity check for a case that shouldn't happen -- but - // if it does happen, we want to avoid dividing by zero later + xMax = x + fontSize; + base = x; + break; + case 2: yMin = y; - yMax = y + 1; + yMax = y + fontSize; + base = y; + break; + case 3: + xMin = x - fontSize; + xMax = x; + base = x; + break; } - base = y; - break; - case 3: - xMin = x - ascent; - xMax = x - descent; - if (xMin == xMax) { - // this is a sanity check for a case that shouldn't happen -- but - // if it does happen, we want to avoid dividing by zero later - xMin = x; - xMax = x + 1; + } else { // horizontal writing mode + switch (rot) { + case 0: + yMin = y - ascent; + yMax = y - descent; + if (yMin == yMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + yMin = y; + yMax = y + 1; + } + base = y; + break; + case 1: + xMin = x + descent; + xMax = x + ascent; + if (xMin == xMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + xMin = x; + xMax = x + 1; + } + base = x; + break; + case 2: + yMin = y + descent; + yMax = y + ascent; + if (yMin == yMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + yMin = y; + yMax = y + 1; + } + base = y; + break; + case 3: + xMin = x - ascent; + xMax = x - descent; + if (xMin == xMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + xMin = x; + xMax = x + 1; + } + base = x; + break; } - base = x; - break; } text = NULL; charcode = NULL; edge = NULL; + charPos = NULL; len = size = 0; spaceAfter = gFalse; next = NULL; @@ -327,47 +356,90 @@ TextWord::~TextWord() { gfree(text); gfree(charcode); gfree(edge); + gfree(charPos); } void TextWord::addChar(GfxState *state, double x, double y, - double dx, double dy, CharCode c, Unicode u) { + double dx, double dy, int charPosA, int charLen, + CharCode c, Unicode u) { + int wMode; + if (len == size) { size += 16; text = (Unicode *)greallocn(text, size, sizeof(Unicode)); charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); + charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); } text[len] = u; charcode[len] = c; - switch (rot) { - case 0: - if (len == 0) { - xMin = x; - } - edge[len] = x; - xMax = edge[len+1] = x + dx; - break; - case 1: - if (len == 0) { - yMin = y; - } - edge[len] = y; - yMax = edge[len+1] = y + dy; - break; - case 2: - if (len == 0) { - xMax = x; - } - edge[len] = x; - xMin = edge[len+1] = x + dx; - break; - case 3: - if (len == 0) { - yMax = y; + charPos[len] = charPosA; + charPos[len + 1] = charPosA + charLen; + wMode = font->gfxFont ? font->gfxFont->getWMode() : 0; + if (wMode) { // vertical writing mode + // NB: the rotation value has been incremented by 1 (in + // TextPage::beginWord()) for vertical writing mode + switch (rot) { + case 0: + if (len == 0) { + xMin = x - fontSize; + } + edge[len] = x - fontSize; + xMax = edge[len+1] = x; + break; + case 1: + if (len == 0) { + yMin = y - fontSize; + } + edge[len] = y - fontSize; + yMax = edge[len+1] = y; + break; + case 2: + if (len == 0) { + xMax = x + fontSize; + } + edge[len] = x + fontSize; + xMin = edge[len+1] = x; + break; + case 3: + if (len == 0) { + yMax = y + fontSize; + } + edge[len] = y + fontSize; + yMin = edge[len+1] = y; + break; } - edge[len] = y; - yMin = edge[len+1] = y + dy; - break; + } else { // horizontal writing mode + switch (rot) { + case 0: + if (len == 0) { + xMin = x; + } + edge[len] = x; + xMax = edge[len+1] = x + dx; + break; + case 1: + if (len == 0) { + yMin = y; + } + edge[len] = y; + yMax = edge[len+1] = y + dy; + break; + case 2: + if (len == 0) { + xMax = x; + } + edge[len] = x; + xMin = edge[len+1] = x + dx; + break; + case 3: + if (len == 0) { + yMax = y; + } + edge[len] = y; + yMin = edge[len+1] = y + dy; + break; + } } ++len; } @@ -392,15 +464,17 @@ void TextWord::merge(TextWord *word) { text = (Unicode *)greallocn(text, size, sizeof(Unicode)); charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode)); edge = (double *)greallocn(edge, (size + 1), sizeof(double)); + charPos = (int *)greallocn(charPos, size + 1, sizeof(int)); } for (i = 0; i < word->len; ++i) { text[len + i] = word->text[i]; charcode[len + i] = word->charcode[i]; edge[len + i] = word->edge[i]; + charPos[len + i] = word->charPos[i]; } edge[len + word->len] = word->edge[word->len]; + charPos[len + word->len] = word->charPos[word->len]; len += word->len; - charLen += word->charLen; } inline int TextWord::primaryCmp(TextWord *word) { @@ -792,7 +866,7 @@ void TextLine::coalesce(UnicodeMap *uMap) { word0->underlined == word1->underlined && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize && - word1->charPos == word0->charPos + word0->charLen) { + word1->charPos[0] == word0->charPos[word0->len]) { word0->merge(word1); word0->next = word1->next; delete word1; @@ -2153,12 +2227,18 @@ void TextPage::beginWord(GfxState *state, double x0, double y0) { m[3] = m2[3]; } if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) { - rot = (m[3] < 0) ? 0 : 2; + rot = (m[0] > 0 || m[3] < 0) ? 0 : 2; } else { rot = (m[2] > 0) ? 1 : 3; } - curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize); + // for vertical writing mode, the lines are effectively rotated 90 + // degrees + if (state->getFont()->getWMode()) { + rot = (rot + 1) & 3; + } + + curWord = new TextWord(state, rot, x0, y0, curFont, curFontSize); } void TextPage::addChar(GfxState *state, double x, double y, @@ -2199,9 +2279,6 @@ void TextPage::addChar(GfxState *state, double x, double y, // break words at space character if (uLen == 1 && u[0] == (Unicode)0x20) { - if (curWord) { - ++curWord->charLen; - } charPos += nBytes; endWord(); return; @@ -2284,24 +2361,21 @@ void TextPage::addChar(GfxState *state, double x, double y, /* next code is a low surrogate */ Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000; i++; - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, uu); + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, uu); } else { /* missing low surrogate replace it with REPLACEMENT CHARACTER (U+FFFD) */ - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd); + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd); } } else if (u[i] >= 0xdc00 && u[i] < 0xe000) { /* invalid low surrogate replace it with REPLACEMENT CHARACTER (U+FFFD) */ - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd); + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, 0xfffd); } else { - curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]); + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, charPos, nBytes, c, u[i]); } } } - if (curWord) { - curWord->charLen += nBytes; - } charPos += nBytes; } @@ -4606,16 +4680,14 @@ GBool TextPage::findCharRange(int pos, int length, blk = blocks[i]; for (line = blk->lines; line; line = line->next) { for (word = line->words; word; word = word->next) { - if (pos < word->charPos + word->charLen && - word->charPos < pos + length) { - j0 = pos - word->charPos; - if (j0 < 0) { - j0 = 0; - } - j1 = pos + length - 1 - word->charPos; - if (j1 >= word->len) { - j1 = word->len - 1; - } + if (pos < word->charPos[word->len] && + pos + length > word->charPos[0]) { + for (j0 = 0; + j0 < word->len && pos >= word->charPos[j0 + 1]; + ++j0) ; + for (j1 = word->len - 1; + j1 > j0 && pos + length <= word->charPos[j1]; + --j1) ; switch (line->rot) { case 0: xMin1 = word->edge[j0]; diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index dff3921..2dd78cd 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -114,14 +114,15 @@ public: // Constructor. TextWord(GfxState *state, int rotA, double x0, double y0, - int charPosA, TextFontInfo *fontA, double fontSize); + TextFontInfo *fontA, double fontSize); // Destructor. ~TextWord(); // Add a character to the word. void addChar(GfxState *state, double x, double y, - double dx, double dy, CharCode c, Unicode u); + double dx, double dy, int charPosA, int charLen, + CharCode c, Unicode u); // Merge <word> onto the end of <this>. void merge(TextWord *word); @@ -159,8 +160,8 @@ public: double *xMaxA, double *yMaxA); double getFontSize() { return fontSize; } int getRotation() { return rot; } - int getCharPos() { return charPos; } - int getCharLen() { return charLen; } + int getCharPos() { return charPos[0]; } + int getCharLen() { return charPos[len] - charPos[0]; } GBool getSpaceAfter() { return spaceAfter; } #endif GBool isUnderlined() { return underlined; } @@ -180,11 +181,11 @@ private: CharCode *charcode; // glyph indices double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) - int len; // length of text and edge arrays - int size; // size of text and edge arrays - int charPos; // character position (within content stream) - int charLen; // number of content stream characters in - // this word + int *charPos; // character position (within content stream) + // of each char (plus one extra entry for + // the last char) + int len; // length of text/edge/charPos arrays + int size; // size of text/edge/charPos arrays TextFontInfo *font; // font information double fontSize; // font size GBool spaceAfter; // set if there is a space between this _______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
