El mar, 27-04-2010 a las 18:28 +0200, Daniel Garcia Moreno escribió:
> Hi to all:
>
> I'm reading the poppler code and changing a few things here and there
> because I'm going to implement the ATK interface for Evince, and I need
> to know how to get the text of a PDF file from glib.
>
> I want to get the text in the order in which you would read it. I saw
> that pdftotext gets the text well ordered using the "-raw" option. I
> looked at the code and saw that it uses TextOutputDev with rawOrder = true.
>
> It's easy to dump the text to a file using the first argument that the
> TextOutputDev constructor receives, but I want to get the text as a
> char *.
>
> I saw that when using rawOrder in TextOutputDev you can't use the getText
> method; it always returns an empty GooString:
>
> ...
> 3603 s = new GooString();
> 3604
> 3605 if (rawOrder) {
> 3606 return s;
> 3607 }
> ...
>
> And here is the question: is that a bug/not_implemented_feature, or is
> it like that for some reason?
>
> If you think that's a bug, I could file it and upload a patch to
> "solve" it using the TextWordList.
>
I filed the bug [1] and attached a patch. I'm attaching the patch to this
mail too.
[1] https://bugs.freedesktop.org/show_bug.cgi?id=27999
>From 50f6ae59d101fb60bd61e8b8063eac10121074da Mon Sep 17 00:00:00 2001
From: danigm <[email protected]>
Date: Thu, 6 May 2010 23:52:04 +0200
Subject: [PATCH] TextPage::getText returns text in rawOrder
---
poppler/TextOutputDev.cc | 30 +++++++++++++++++++---
test/CMakeLists.txt | 7 +++-
test/gettext-test.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 94 insertions(+), 6 deletions(-)
create mode 100644 test/gettext-test.cc
diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index ef9c486..28b864b 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3602,10 +3602,6 @@ GooString *TextPage::getText(double xMin, double yMin,
s = new GooString();
- if (rawOrder) {
- return s;
- }
-
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
return s;
@@ -3626,6 +3622,32 @@ GooString *TextPage::getText(double xMin, double yMin,
break;
}
+ if (rawOrder) {
+ TextWordList *wordlist;
+ wordlist = makeWordList(gFalse);
+ int word_length = wordlist->getLength ();
+ TextWord *word;
+ double xMinA, yMinA, xMaxA, yMaxA;
+
+ for (int i=0; i < word_length; i++)
+ {
+ word = wordlist->get (i);
+ word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
+ if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax)
+ s->append (word->getText ());
+ else
+ continue;
+ if (word->getNext() && word->getNext()->primaryDelta (word) <= 0)
+ {
+ s->append(space, spaceLen);
+ } else {
+ s->append(eol, eolLen);
+ }
+ }
+ return s;
+ }
+
+
//~ writing mode (horiz/vert)
// collect the line fragments that are in the rectangle
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index acb867b..fadcd45 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,3 @@
-
if (ENABLE_SPLASH)
if (HAVE_NANOSLEEP OR LIB_RT_HAS_NANOSLEEP)
@@ -58,4 +57,8 @@ set (pdf_fullrewrite_SRCS
add_executable(pdf-fullrewrite ${pdf_fullrewrite_SRCS})
target_link_libraries(pdf-fullrewrite poppler)
-
+set (gettext_SRCS
+ gettext-test.cc
+ )
+add_executable(gettext-test ${gettext_SRCS})
+target_link_libraries(gettext-test poppler)
diff --git a/test/gettext-test.cc b/test/gettext-test.cc
new file mode 100644
index 0000000..0c32a9e
--- /dev/null
+++ b/test/gettext-test.cc
@@ -0,0 +1,63 @@
+#include "config.h"
+#include "Page.h"
+#include <poppler-config.h>
+#include "GlobalParams.h"
+#include "Error.h"
+#include "PDFDoc.h"
+#include "goo/GooString.h"
+#include "TextOutputDev.h"
+
+int main (int argc, char *argv[])
+{
+ PDFDoc *doc;
+ GooString *inputName;
+ GooString *s;
+ char *result;
+ int page_index;
+ TextOutputDev *textOut;
+ Page *page;
+ PDFRectangle *rect;
+
+ // parse args
+ if (argc < 3) {
+ fprintf(stderr, "usage: %s INPUT-FILE page\n", argv[0]);
+ return 1;
+ }
+ if (!sscanf (argv[2], "%d", &page_index))
+ {
+ fprintf(stderr, "usage: %s INPUT-FILE page\n", argv[0]);
+ return 1;
+ }
+
+ inputName = new GooString(argv[1]);
+
+ globalParams = new GlobalParams();
+
+ doc = new PDFDoc(inputName);
+
+ if (!doc->isOk()) {
+ delete doc;
+ fprintf(stderr, "Error loading document !\n");
+ return 1;
+ }
+
+ page = doc->getCatalog()->getPage(1);
+
+ //textOut = new TextOutputDev(0, gFalse, gFalse, gFalse);
+ textOut = new TextOutputDev(0, gTrue, gTrue, gFalse);
+ doc->displayPageSlice(textOut, page_index, 72, 72,
+ 0, false, true, false, -1, -1, -1, -1);
+
+ rect = page->getCropBox();
+ s = textOut->getText(rect->x1, rect->y1, rect->x2, rect->y2);
+
+ result = s->getCString ();
+ printf ("%s\n", result);
+
+ delete textOut;
+ delete s;
+
+ delete doc;
+ delete globalParams;
+ return 0;
+}
--
1.7.0.4.361.g8b5fe
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler