> > I filed the bug [1], and attached a patch. I attach the patch in this > mail too. >
I made a new patch that improves the adjustment to the x and y limits in TextPage::getText [1]. I also have a branch on GitHub [2] where I'm pushing these changes. I attach here the two patches that fix the bug. [1] https://bugs.freedesktop.org/show_bug.cgi?id=27999 [2] http://github.com/danigm/poppler/tree/gettext
>From 50f6ae59d101fb60bd61e8b8063eac10121074da Mon Sep 17 00:00:00 2001 From: danigm <[email protected]> Date: Thu, 6 May 2010 23:52:04 +0200 Subject: [PATCH 1/2] TextData::getText return text in rawOrder --- poppler/TextOutputDev.cc | 30 +++++++++++++++++++--- test/CMakeLists.txt | 7 +++- test/gettext-test.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 test/gettext-test.cc diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index ef9c486..28b864b 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3602,10 +3602,6 @@ GooString *TextPage::getText(double xMin, double yMin, s = new GooString(); - if (rawOrder) { - return s; - } - // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return s; @@ -3626,6 +3622,32 @@ GooString *TextPage::getText(double xMin, double yMin, break; } + if (rawOrder) { + TextWordList *wordlist; + wordlist = makeWordList(gFalse); + int word_length = wordlist->getLength (); + TextWord *word; + double xMinA, yMinA, xMaxA, yMaxA; + + for (int i=0; i < word_length; i++) + { + word = wordlist->get (i); + word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA); + if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax) + s->append (word->getText ()); + else + continue; + if (word->getNext() && word->getNext()->primaryDelta (word) <= 0) + { + s->append(space, spaceLen); + } else { + s->append(eol, eolLen); + } + } + return s; + } + + //~ writing mode (horiz/vert) // collect the line fragments that are in the rectangle diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index acb867b..fadcd45 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,3 @@ - if (ENABLE_SPLASH) if (HAVE_NANOSLEEP OR LIB_RT_HAS_NANOSLEEP) @@ -58,4 +57,8 @@ set (pdf_fullrewrite_SRCS add_executable(pdf-fullrewrite ${pdf_fullrewrite_SRCS}) target_link_libraries(pdf-fullrewrite poppler) - +set (gettext_SRCS + gettext-test.cc + ) 
+add_executable(gettext-test ${gettext_SRCS}) +target_link_libraries(gettext-test poppler) diff --git a/test/gettext-test.cc b/test/gettext-test.cc new file mode 100644 index 0000000..0c32a9e --- /dev/null +++ b/test/gettext-test.cc @@ -0,0 +1,63 @@ +#include "config.h" +#include "Page.h" +#include <poppler-config.h> +#include "GlobalParams.h" +#include "Error.h" +#include "PDFDoc.h" +#include "goo/GooString.h" +#include "TextOutputDev.h" + +int main (int argc, char *argv[]) +{ + PDFDoc *doc; + GooString *inputName; + GooString *s; + char *result; + int page_index; + TextOutputDev *textOut; + Page *page; + PDFRectangle *rect; + + // parse args + if (argc < 3) { + fprintf(stderr, "usage: %s INPUT-FILE page\n", argv[0]); + return 1; + } + if (!sscanf (argv[2], "%d", &page_index)) + { + fprintf(stderr, "usage: %s INPUT-FILE page\n", argv[0]); + return 1; + } + + inputName = new GooString(argv[1]); + + globalParams = new GlobalParams(); + + doc = new PDFDoc(inputName); + + if (!doc->isOk()) { + delete doc; + fprintf(stderr, "Error loading document !\n"); + return 1; + } + + page = doc->getCatalog()->getPage(1); + + //textOut = new TextOutputDev(0, gFalse, gFalse, gFalse); + textOut = new TextOutputDev(0, gTrue, gTrue, gFalse); + doc->displayPageSlice(textOut, page_index, 72, 72, + 0, false, true, false, -1, -1, -1, -1); + + rect = page->getCropBox(); + s = textOut->getText(rect->x1, rect->y1, rect->x2, rect->y2); + + result = s->getCString (); + printf ("%s\n", result); + + delete textOut; + delete s; + + delete doc; + delete globalParams; + return 0; +} -- 1.7.1
>From accb938021cf2bd0f7ae37546c601623f5dc1f1b Mon Sep 17 00:00:00 2001 From: danigm <[email protected]> Date: Mon, 10 May 2010 10:14:57 +0200 Subject: [PATCH 2/2] TextData::getText in rawOrder now count chars The previous getText with rawOrder only looked if words are between limits. This commit adds chars of a word which are in. --- poppler/TextOutputDev.cc | 43 ++++++++++++++++++++++++++++++++----------- test/gettext-test.cc | 1 + 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 28b864b..4c42b30 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -3626,22 +3626,43 @@ GooString *TextPage::getText(double xMin, double yMin, TextWordList *wordlist; wordlist = makeWordList(gFalse); int word_length = wordlist->getLength (); - TextWord *word; + TextWord *word=NULL, *prev_word=NULL; + const Unicode *word_char; + char buf[8]; + bool outOfBound = false; double xMinA, yMinA, xMaxA, yMaxA; - for (int i=0; i < word_length; i++) - { + for (int i=0; i < word_length; i++) { word = wordlist->get (i); + + if (prev_word && word->primaryDelta (prev_word) <= 0) { + if (!outOfBound) + s->append(space, spaceLen); + } else { + s->append(eol, eolLen); + } + word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA); - if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax) + if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax) { s->append (word->getText ()); - else - continue; - if (word->getNext() && word->getNext()->primaryDelta (word) <= 0) - { - s->append(space, spaceLen); - } else { - s->append(eol, eolLen); + prev_word = word; + outOfBound = false; + } + else if (xMinA < xMax && yMinA < yMax) { + for (int i=0; i < word->getLength(); i++) { + int n; + word->getCharBBox(i, &xMinA, &yMinA, &xMaxA, &yMaxA); + if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax) { + word_char = word->getChar(i); + n = uMap->mapUnicode(*word_char, buf, sizeof(buf)); + s->append(buf, n); + 
} + } + prev_word = word; + outOfBound = true; + } + else { + outOfBound = true; } } return s; diff --git a/test/gettext-test.cc b/test/gettext-test.cc index 0c32a9e..58f07a9 100644 --- a/test/gettext-test.cc +++ b/test/gettext-test.cc @@ -50,6 +50,7 @@ int main (int argc, char *argv[]) rect = page->getCropBox(); s = textOut->getText(rect->x1, rect->y1, rect->x2, rect->y2); + //s = textOut->getText(0, 0, 200, 1000); result = s->getCString (); printf ("%s\n", result); -- 1.7.1
_______________________________________________ poppler mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/poppler
