I've modified my patch so that it works as before when !rawOrder, and makes the selection with the new "raw" algorithm when TextOutputDev.rawOrder is set.
This solution doesn't break the current one and adds new functionality to poppler. What do you think about it? Is it unneeded, or is it a good feature to add to poppler? I think it's useful, because I wrote it :P and because it works for me...
>From 49147f33d6879af7f4b8157aea58edf9624bbcb9 Mon Sep 17 00:00:00 2001 From: Daniel Garcia <[email protected]> Date: Wed, 15 Sep 2010 12:56:50 +0200 Subject: [PATCH] Select text in raworder. --- poppler/TextOutputDev.cc | 388 ++++++++++++++++++++++++++++++++++++++++++++-- poppler/TextOutputDev.h | 20 +++ 2 files changed, 397 insertions(+), 11 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 576bcc9..179a15c 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -416,6 +416,27 @@ inline int TextWord::primaryCmp(TextWord *word) { return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } +inline int TextWord::secondaryCmp(TextWord *word) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = yMin - word->yMin; + break; + case 1: + cmp = xMin - word->xMin; + break; + case 2: + cmp = word->yMax - yMax; + break; + case 3: + cmp = word->xMax - xMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + double TextWord::primaryDelta(TextWord *word) { double delta; @@ -1860,15 +1881,28 @@ TextWordList::TextWordList(TextPage *text, GBool physLayout) { TextFlow *flow; TextBlock *blk; TextLine *line; - TextWord *word; + TextWord *word, *prevword=NULL; TextWord **wordArray; int nWords, i; words = new GooList(); if (text->rawOrder) { - for (word = text->rawWords; word; word = word->next) { - words->append(word); + if (text->primaryLR) { + for (word = text->rawWords; word; word = word->next) { + words->append(word); + } + } else { + i = 0; + for (word = text->rawWords; word; word = word->next) { + if (prevword) { + if (word->secondaryCmp(prevword)) { + i = getLength(); + } + } + words->insert(i, word); + prevword = word; + } } } else if (physLayout) { @@ -2361,6 +2395,24 @@ void TextPage::coalesce(GBool physLayout, GBool doHTML) { if (rawOrder) { primaryRot = 0; primaryLR = gTrue; + + // determine the primary direction + lrCount = 0; + TextWordList *wordlist = makeWordList(gFalse); + if (wordlist->getLength()) { 
+ for (word0 = wordlist->get(0); word0; word0 = word0->next) { + for (i = 0; i < word0->len; ++i) { + if (unicodeTypeL(word0->text[i])) { + ++lrCount; + } else if (unicodeTypeR(word0->text[i])) { + --lrCount; + } + } + } + primaryLR = lrCount >= 0; + } + delete wordlist; + return; } @@ -4105,6 +4157,7 @@ public: PDFRectangle *selection); virtual void visitWord (TextWord *word, int begin, int end, PDFRectangle *selection); + void drawRegion (PDFRectangle *region); private: OutputDev *out; @@ -4184,6 +4237,21 @@ void TextSelectionPainter::visitLine (TextLine *line, state->clearPath(); } +void TextSelectionPainter::drawRegion (PDFRectangle *region) +{ + state->setFillColor(box_color); + out->updateFillColor(state); + + state->moveTo(region->x1, region->y1); + state->lineTo(region->x2, region->y1); + state->lineTo(region->x2, region->y2); + state->lineTo(region->x1, region->y2); + state->closePath(); + + out->fill(state); + state->clearPath(); +} + void TextSelectionPainter::visitWord (TextWord *word, int begin, int end, PDFRectangle *selection) { @@ -4543,6 +4611,105 @@ void TextPage::visitSelection(TextSelectionVisitor *visitor, } } +void TextPage::getSelectionWordLimits(PDFRectangle *selection, + SelectionStyle style, + double scale, + int *first, + int *last, + int *first_c, + int *last_c) { + TextWordList *wordlist = makeWordList(gFalse); + TextWord *word=NULL; + double distance, minor=-1, minor1=-1; + double xmin, ymin, xmax, ymax; + double x1, y1, x2, y2; + int tmp; + + x1 = selection->x1; + x2 = selection->x2; + + y1 = selection->y1; + y2 = selection->y2; + + for (int i=0; i<wordlist->getLength(); i++) { + word = wordlist->get(i); + + for (int j=0; j<word->getLength(); j++) { + word->getCharBBox(j, &xmin, &ymin, &xmax, &ymax); + + distance = fabs(x1 - xmin) + 10*fabs(y1 - ymin); + if (minor < 0 || distance < minor) { + *first = i; + *first_c = j; + minor = distance; + } + + distance = fabs(x1 - xmin) + 10*fabs(y1 - ymax); + if (minor < 0 || distance < minor) { 
+ *first = i; + *first_c = j; + minor = distance; + } + + distance = fabs(x2 - xmax) + 10*fabs(y2 - ymax); + if (minor1 < 0 || distance < minor1) { + *last = i; + *last_c = j; + minor1 = distance; + } + } + } + + switch (style) { + case selectionStyleGlyph: + break; + case selectionStyleLine: + for (int i=*first; i>=0; i--) { + word = wordlist->get(i); + if (!word->secondaryCmp(wordlist->get(*first))) { + *first = i; + } + } + for (int i=*last; i<wordlist->getLength(); i++) { + word = wordlist->get(i); + if (!word->secondaryCmp(wordlist->get(*last))) { + *last = i; + } + } + case selectionStyleWord: + *first_c = wordlist->get(*first)->getLength() - 1; + if (primaryLR) { + *last_c = wordlist->get(*last)->getLength() - 1; + } else { + *last_c = 0; + } + if (last == first) { + *last_c = wordlist->get(*last)->getLength() - 1; + *first_c = 0; + } + break; + default: break; + } + + if (*first > *last) { + tmp = *last; + *last = *first; + *first = tmp; + + tmp = *last_c; + *last_c = *first_c; + *first_c = tmp; + } + + if (*first == *last && *first_c > *last_c) { + tmp = *last_c; + *last_c = *first_c; + *first_c = tmp; + } + + delete wordlist; +} + void TextPage::drawSelection(OutputDev *out, double scale, int rotation, @@ -4550,30 +4717,229 @@ void TextPage::drawSelection(OutputDev *out, SelectionStyle style, GfxColor *glyph_color, GfxColor *box_color) { + if (!rawOrder) { + TextSelectionPainter painter(this, scale, rotation, + out, box_color, glyph_color); + visitSelection(&painter, selection, style); + } else { + drawSelectionRaw(out, scale, rotation, selection, style, glyph_color, box_color); + } +} + +void TextPage::drawSelectionRaw(OutputDev *out, + double scale, + int rotation, + PDFRectangle *selection, + SelectionStyle style, + GfxColor *glyph_color, + GfxColor *box_color) +{ TextSelectionPainter painter(this, scale, rotation, - out, box_color, glyph_color); + out, box_color, glyph_color); + int first, last, first_c, last_c, begin, end; + TextWordList *wordlist = 
makeWordList(gFalse); + TextWord *word = NULL; + PDFRectangle *rect; + GooList *rlist; - visitSelection(&painter, selection, style); + getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c); + rlist = getSelectionRegion(selection, style, scale); + for(int i=0; i<rlist->getLength(); i++) { + rect = (PDFRectangle *)rlist->get(i); + painter.drawRegion(rect); + } + + for(int i=first; i<=last; i++) { + word = wordlist->get(i); + if (primaryLR) { + if (i == first && i == last) { + begin = first_c; + end = last_c + 1; + } else if (i == first) { + begin = first_c; + end = word->getLength(); + } else if (i == last) { + begin = 0; + end = last_c + 1; + } else { + begin = 0; + end = word->getLength(); + } + } else { + if (i == first && i == last) { + begin = first_c; + end = last_c + 1; + } else if (i == first) { + begin = 0; + end = first_c + 1; + } else if (i == last) { + begin = last_c; + end = word->getLength(); + } else { + begin = 0; + end = word->getLength(); + } + } + + painter.visitWord(word, begin, end, selection); + } + + delete wordlist; } GooList *TextPage::getSelectionRegion(PDFRectangle *selection, SelectionStyle style, double scale) { - TextSelectionSizer sizer(this, scale); + if (!rawOrder) { + TextSelectionSizer sizer(this, scale); + visitSelection(&sizer, selection, style); + return sizer.getRegion(); + } else { + return getSelectionRegionRaw(selection, style, scale); + } +} + +GooList *TextPage::getSelectionRegionRaw(PDFRectangle *selection, + SelectionStyle style, + double scale) +{ + GooList *ret = new GooList(); + PDFRectangle *rect = NULL; + TextWordList *wordlist = makeWordList(gFalse); + TextWord *word=NULL, *prevword=NULL; + int first=0, last=0, first_c=0, last_c=0; + double xmin, ymin, xmax, ymax; + double xmin1, ymin1, xmax1, ymax1; + + getSelectionWordLimits(selection, style, scale, &first, &last, &first_c, &last_c); + + for (int i=first; i<=last; i++) { + word = wordlist->get(i); + if (prevword && 
!word->secondaryCmp(prevword) && rect) { + if (i == last) { + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + } + else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + } - visitSelection(&sizer, selection, style); + if (primaryLR) { + rect->x2 = xmax; + } else { + rect->x1 = xmin; + } + prevword = word; + continue; + } - return sizer.getRegion(); + if (primaryLR) { + if (i == first && i == last) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else if (i == first) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else if (i == last) { + word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmax = xmax1; ymax = ymax1; + } else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + } + } else { + if (i == first && i == last) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getCharBBox(last_c, &xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else if (i == first) { + word->getCharBBox(first_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmax = xmax1; ymax = ymax1; + } else if (i == last) { + word->getCharBBox(last_c, &xmin1, &ymin1, &xmax1, &ymax1); + word->getBBox(&xmin, &ymin, &xmax, &ymax); + xmin = xmin1; ymin = ymin1; + } else { + word->getBBox(&xmin, &ymin, &xmax, &ymax); + } + } + + rect = new PDFRectangle(xmin, ymin, xmax, ymax); + ret->append(rect); + prevword = word; + } + + delete wordlist; + + return ret; } GooString *TextPage::getSelectionText(PDFRectangle *selection, SelectionStyle style) { - TextSelectionDumper dumper(this); + if (!rawOrder) { + TextSelectionDumper dumper(this); + visitSelection(&dumper, selection, style); + return dumper.getText(); + } else { + return getSelectionTextRaw(selection, style); + } +} + + +GooString 
*TextPage::getSelectionTextRaw(PDFRectangle *selection, + SelectionStyle style) +{ + GooString *ret = new GooString(); + TextWordList *wordlist = makeWordList(gFalse); + TextWord *word=NULL, *prevword=NULL; + int first=0, last=0, first_c=0, last_c=0; + UnicodeMap *uMap; + // get the output encoding + if (!(uMap = globalParams->getTextEncoding())) { + return ret; + } + + getSelectionWordLimits(selection, style, 1, &first, &last, &first_c, &last_c); + + for (int i=first; i<=last; i++) { + word = wordlist->get(i); + if (prevword) { + if (word->secondaryCmp(prevword)) { + ret->append('\n'); + } else { + ret->append(' '); + } + } + if (primaryLR) { + if (i == first && i == last) { + dumpFragment(word->text + first_c, last_c+1 - first_c, uMap, ret); + } else if (i == first) { + dumpFragment(word->text + first_c, word->len - first_c, uMap, ret); + } else if (i == last) { + dumpFragment(word->text, last_c+1, uMap, ret); + } else { + dumpFragment(word->text, word->len, uMap, ret); + } + } else { + if (i == first && i == last) { + dumpFragment(word->text + first_c, last_c+1 - first_c, uMap, ret); + } else if (i == first) { + dumpFragment(word->text, first_c+1, uMap, ret); + } else if (i == last) { + dumpFragment(word->text + last_c, word->len, uMap, ret); + } else { + dumpFragment(word->text, word->len, uMap, ret); + } + } - visitSelection(&dumper, selection, style); + prevword = word; + } + delete wordlist; - return dumper.getText(); + return ret; } GBool TextPage::findCharRange(int pos, int length, diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 438aee4..4b9edd4 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -125,6 +125,7 @@ public: // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>), // based on a primary-axis comparison, e.g., x ordering if rot=0. int primaryCmp(TextWord *word); + int secondaryCmp(TextWord *word); // Return the distance along the primary axis between <this> and // <word>. 
@@ -581,6 +582,25 @@ private: void clear(); void assignColumns(TextLineFrag *frags, int nFrags, GBool rot); int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s); + void getSelectionWordLimits(PDFRectangle *selection, + SelectionStyle style, + double scale, + int *first, + int *last, + int *first_c, + int *last_c); + GooString *getSelectionTextRaw(PDFRectangle *selection, + SelectionStyle style); + GooList *getSelectionRegionRaw(PDFRectangle *selection, + SelectionStyle style, + double scale); + void drawSelectionRaw(OutputDev *out, + double scale, + int rotation, + PDFRectangle *selection, + SelectionStyle style, + GfxColor *glyph_color, + GfxColor *box_color); GBool rawOrder; // keep text in content stream order -- 1.7.0.4
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
