Hi,

I've tried to implement the suggestion; my current patch is attached.

As suggested, most of it is just copied from the Qt frontend and renamed,
except for one point: TextBox.nextWord() looks slightly confusing,
because the returned object is a pointer to a TextBox. I wrote
text_box.next_text_box() and an inline wrapper text_box.next_word() which
calls next_text_box() internally.

Another point I want to discuss is the design of the list given by
poppler::page::text_list(). In the Qt frontend, Page::textList() returns
QList<TextBox*>. For similarity to the Qt frontend, the current patch
returns std::vector<text_box*>.

But if we return a vector of pointers, the client has to destroy
the objects pointed to by the vector before destroying the vector itself.
Using a vector of text_box (not the pointer but the object itself),
like std::vector<text_box>, could be better, because the destructor
of the vector would internally call the destructor of each text_box object.
(Qt has qDeleteAll(), but I think std::vector has no such helper.)
If I'm misunderstanding something about C++, please correct me.

Regards,
mpsuzuki


Albert Astals Cid wrote:
> El dimecres, 27 de desembre de 2017, a les 12:26:25 CET, Jeroen Ooms va 
> escriure:
>> Is there a method in poppler-cpp to extract text from a pdf document,
>> including the position of each text box? Currently we use page->text()
>> with page::physical_layout which gives all text per page, but I need
>> more detailed information about each text box per page.
> 
> You want to code the variant of qt5 frontend Poppler::Page::textList() for 
> cpp 
> frontend, it shouldn't be that hard getting inspiration (i.e. almost-copying) 
> the code, do you have time for it?
> 
> Cheers,
>   Albert
> 
>> _______________________________________________
>> poppler mailing list
>> [email protected]
>> https://lists.freedesktop.org/mailman/listinfo/poppler
> 
> 
> _______________________________________________
> poppler mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/poppler
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 97a4dbb..9461ab9 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -285,3 +285,102 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
     }
     return ustring::from_utf8(s->getCString());
 }
+
+/*
+ * text_box object for page::text_list()
+ */
+text_box::text_box(const ustring& text, const rectf &bbox)
+    : m_data(new text_box_data()) /* private data, freed by ~text_box() */
+{
+    m_data->text = text;   /* the other fields keep text_box_data's defaults */
+    m_data->bbox = bbox;
+}
+
+text_box::~text_box()
+{   /* m_data was allocated with new in the constructor; release it here */
+    delete m_data;
+}
+
+ustring text_box::text() const
+{   /* returns the stored text by value */
+    return m_data->text;
+}
+
+rectf text_box::bbox() const
+{   /* returns the stored bounding box of the whole text, by value */
+    return m_data->bbox;
+}
+
+text_box* text_box::next_text_box() const
+{   /* 0 by default; page::text_list() points it at the box created after this one */
+    return m_data->next_text_box;
+}
+
+rectf text_box::char_bbox(int i) const
+{   /* box of the i-th character; an out-of-range i yields an empty rectf instead of reading past the vector */
+    return (i >= 0 && (size_t)i < m_data->char_bboxes.size()) ? m_data->char_bboxes[i] : rectf();
+}
+
+bool text_box::has_space_after() const
+{   /* false by default; page::text_list() sets it from TextWord::hasSpaceAfter() */
+    return m_data->has_space_after;
+}
+
+std::vector<text_box*> page::text_list(rotation_enum rotate) const
+{
+    std::vector<text_box*> boxes;   /* result; nothing here deletes the boxes, the caller must */
+
+    /* config values are same with Qt5 Page::TextList() */
+    TextOutputDev *text_dev = new TextOutputDev(NULL,    /* char* fileName */
+                                                gFalse,  /* GBool physLayoutA */
+                                                0,       /* double fixedPitchA */
+                                                gFalse,  /* GBool rawOrderA */
+                                                gFalse); /* GBool append */
+
+    /* render the page into text_dev; config values are same with Qt5 Page::TextList() */
+    d->doc->doc->displayPageSlice(text_dev,
+                                  d->index + 1,              /* page */
+                                  72, 72, (int)rotate * 90,  /* hDPI, vDPI, rot */
+                                  false, false, false,       /* useMediaBox, crop, printing */
+                                  -1, -1, -1, -1,            /* sliceX, sliceY, sliceW, sliceH */
+                                  NULL, NULL,                /* abortCheckCbk(), abortCheckCbkData */
+                                  NULL, NULL,                /* annotDisplayDecideCbk(), annotDisplayDecideCbkData */
+                                  gTrue);                    /* copyXRef */
+
+    TextWordList *words = text_dev->makeWordList();
+    if (words == NULL) {
+        delete text_dev;
+        return boxes;
+    }
+
+    const int word_count = words->getLength();
+    boxes.reserve(word_count);
+    text_box *previous = NULL;   /* box built by the preceding iteration, if any */
+    for (int w = 0; w < word_count; ++w) {
+        TextWord *word = words->get(w);
+        /* the GooString returned by getText() is deleted here once converted */
+        GooString *raw_text = word->getText();
+        const ustring word_text = detail::unicode_GooString_to_ustring(raw_text);
+        delete raw_text;
+
+        double x_min, y_min, x_max, y_max;
+        word->getBBox(&x_min, &y_min, &x_max, &y_max);
+        text_box *box = new text_box(word_text, rectf(x_min, y_min, x_max - x_min, y_max - y_min));
+        box->m_data->has_space_after = (word->hasSpaceAfter() == gTrue);
+
+        const int char_count = word->getLength();   /* one rectangle per character */
+        box->m_data->char_bboxes.reserve(char_count);
+        for (int c = 0; c < char_count; ++c) {
+            word->getCharBBox(c, &x_min, &y_min, &x_max, &y_max);
+            box->m_data->char_bboxes.push_back(rectf(x_min, y_min, x_max - x_min, y_max - y_min));
+        }
+
+        if (previous)
+            previous->m_data->next_text_box = box;   /* chain the boxes in word order */
+        boxes.push_back(box);
+        previous = box;
+    }
+    delete words;
+    delete text_dev;
+    return boxes;
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 7b4298a..18af213 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -25,6 +25,22 @@
 namespace poppler
 {
 
+class text_box_data;
+class POPPLER_CPP_EXPORT text_box {
+    friend class page;    /* page::text_list() fills in m_data directly */
+    public:
+	text_box(const ustring &text, const rectf &bBox);
+	~text_box();          /* deletes m_data; no copy operations are defined, so do not copy a text_box */
+	ustring   text() const;
+	rectf     bbox() const;
+	text_box *next_text_box() const;   /* the following box in the page's list, or null */
+	text_box *next_word() const { return next_text_box(); }   /* Qt-style alias for next_text_box() */
+	rectf     char_bbox(int i) const;  /* bounding box of the i-th character */
+	bool      has_space_after() const;
+    private:
+	text_box_data* m_data; /* owned by this object */
+};
+
 class document;
 class document_private;
 class page_private;
@@ -63,6 +79,8 @@ public:
     ustring text(const rectf &rect = rectf()) const;
     ustring text(const rectf &rect, text_layout_enum layout_mode) const;
 
+    std::vector<text_box*> text_list(rotation_enum rotation) const;
+
 private:
     page(document_private *doc, int index);
 
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 147073d..726068a 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -67,6 +67,21 @@ void delete_all(const Collection &c)
     delete_all(c.begin(), c.end());
 }
 
+class text_box;
+class text_box_data   /* private payload of text_box, held through text_box::m_data */
+{
+public:
+    text_box_data()
+    : next_text_box(0), has_space_after(false)   /* no successor and no trailing space until set */
+    {
+    }
+    ustring text;                     /* the box's text */
+    rectf bbox;                       /* bounding box of the whole text */
+    text_box *next_text_box;          /* not owned: ~text_box() deletes only its own m_data */
+    std::vector<rectf> char_bboxes;   /* one rectangle per character, filled by page::text_list() */
+    bool has_space_after;             /* returned by text_box::has_space_after() */
+};
+
 }
 
 #endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 706ad39..09b180d 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -50,6 +50,7 @@ bool show_embedded_files = false;
 bool show_pages = false;
 bool show_help = false;
 char show_text[32];
+bool show_text_list = false;
 poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
 
 static const ArgDesc the_args[] = {
@@ -71,6 +72,8 @@ static const ArgDesc the_args[] = {
       "show pages information" },
     { "--show-text",           argString, &show_text,          sizeof(show_text),
       "show text (physical|raw) extracted from all pages" },
+    { "--show-text-list",      argFlag, &show_text_list,       0,
+      "show text list (experimental)" },
     { "-h",                    argFlag,  &show_help,           0,
       "print usage information" },
     { "--help",                argFlag,  &show_help,           0,
@@ -323,6 +326,29 @@ static void print_page_text(poppler::page *p)
     std::cout << std::endl;
 }
 
+static void print_page_text_list(poppler::page *p)
+{
+    if (!p) {
+        std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl << std::endl;
+        return;
+    }
+    /* text_list() returns raw pointers; each box is deleted below once printed */
+    const std::vector<poppler::text_box*> words = p->text_list(poppler::rotate_0);
+    typedef std::vector<poppler::text_box*>::const_iterator word_iter;
+
+    std::cout << "---" << std::endl;
+    for (word_iter it = words.begin(); it != words.end(); ++it) {
+        const poppler::text_box *word = *it;
+        const poppler::rectf box = word->bbox();
+        std::cout << "[" << word->text() << "] @ "
+                  << "( x=" << box.x() << " y=" << box.y() << " w=" << box.width() << " h=" << box.height() << " )"
+                  << std::endl;
+        delete word;
+    }
+    std::cout << "---" << std::endl;
+}
+
+
 int main(int argc, char *argv[])
 {
     if (!parseArgs(the_args, &argc, argv)
@@ -398,6 +424,14 @@ int main(int argc, char *argv[])
             print_page_text(p.get());
         }
     }
+    if (show_text_list) {
+        const int pages = doc->pages();
+        for (int i = 0; i < pages; ++i) {
+            std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+            std::unique_ptr<poppler::page> p(doc->create_page(i));
+            print_page_text_list(p.get());
+        }
+    }
 
     return 0;
 }
_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to