Hi,
I've tried to implement the suggestion; my current patch is attached.
As suggested, most of it is just copied from the Qt frontend and renamed,
except for one point: TextBox.nextWord() looks slightly confusing,
because the returned object is a pointer to a TextBox. I wrote
text_box.next_text_box() and an inline wrapper text_box.next_word() which
calls next_text_box() internally.
Another point I want to discuss is the design of the list given by
poppler::page::text_list(). In the Qt frontend, Page::textList() returns
QList<TextBox*>. For similarity to the Qt frontend, the current patch
returns std::vector<text_box*>.
But if we return a vector of pointers, the client has to destroy
the objects pointed to by the vector before destroying the vector itself.
Using a vector of text_box (not the pointer but the object itself),
like std::vector<text_box>, could be better, because the destructor
of the vector would internally call the destructor of each text_box object.
(Qt has qDeleteAll(), but I think std::vector does not have such a helper.)
If I'm misunderstanding something about C++, please correct me.
Regards,
mpsuzuki
Albert Astals Cid wrote:
> El dimecres, 27 de desembre de 2017, a les 12:26:25 CET, Jeroen Ooms va
> escriure:
>> Is there a method in poppler-cpp to extract text from a pdf document,
>> including the position of each text box? Currently we use page->text()
>> with page::physical_layout which gives all text per page, but I need
>> more detailed information about each text box per page.
>
> You want to code the variant of qt5 frontend Poppler::Page::textList() for
> cpp
> frontend, it shouldn't be that hard getting inspiration (i.e. almost-copying)
> the code, do you have time for it?
>
> Cheers,
> Albert
>
>> _______________________________________________
>> poppler mailing list
>> [email protected]
>> https://lists.freedesktop.org/mailman/listinfo/poppler
>
>
> _______________________________________________
> poppler mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/poppler
diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp
index 97a4dbb..9461ab9 100644
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -285,3 +285,102 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const
}
return ustring::from_utf8(s->getCString());
}
+
+/*
+ * text_box object for page::text_list()
+ */
+text_box::text_box(const ustring& text, const rectf &bbox)
+    : m_data(new text_box_data())
+{
+    /* Fill in the freshly allocated private record; every other field keeps
+     * the default given by text_box_data's constructor. */
+    m_data->bbox = bbox;
+    m_data->text = text;
+}
+
+/*
+ * Frees the private text_box_data record allocated by the constructor.
+ * The box referenced by next_text_box is not touched.
+ */
+text_box::~text_box()
+{
+ delete m_data;
+}
+
+/* Returns a copy of the text stored for this box. */
+ustring text_box::text() const
+{
+ return m_data->text;
+}
+
+/* Returns the bounding rectangle stored for this box. */
+rectf text_box::bbox() const
+{
+ return m_data->bbox;
+}
+
+/*
+ * Returns the stored pointer to the following box, or a null pointer when
+ * none was set (text_box_data initialises it to 0).  page::text_list()
+ * points it at the box that comes next in the list it returns.
+ * The pointee is not owned by this box: the destructor deletes only m_data.
+ */
+text_box* text_box::next_text_box() const
+{
+ return m_data->next_text_box;
+}
+
+/*
+ * Returns the bounding box recorded for the i-th character of this box.
+ *
+ * An index outside [0, number of recorded character boxes) yields a
+ * default-constructed rectf instead of indexing past the end of the
+ * vector, which would be undefined behaviour.
+ */
+rectf text_box::char_bbox(int i) const
+{
+    const std::vector<rectf> &boxes = m_data->char_bboxes;
+    if (i < 0 || static_cast<std::vector<rectf>::size_type>(i) >= boxes.size()) {
+        return rectf();
+    }
+    return boxes[i];
+}
+
+/*
+ * Returns the stored "space after" flag; page::text_list() fills it from
+ * TextWord::hasSpaceAfter().
+ */
+bool text_box::has_space_after() const
+{
+ return m_data->has_space_after;
+}
+
+/*
+ * Extracts the words of this page together with their positions.
+ *
+ * The page is rendered into a TextOutputDev (72x72 DPI, requested rotation)
+ * and one text_box is allocated with new for every entry of the resulting
+ * word list, in word-list order.  Each box carries the word text, the word
+ * bounding box, one bounding box per character and the "space after" flag.
+ * The next_text_box of each box points at the box that follows it in the
+ * returned vector; the last box keeps a null pointer.
+ *
+ * Ownership: the caller must delete every text_box in the returned vector.
+ * An empty vector is returned when makeWordList() yields no list.
+ */
+std::vector<text_box*> page::text_list(rotation_enum rotate) const
+{
+    std::vector<text_box*> output_list;
+    /* rotation angle handed to displayPageSlice(); assumes rotation_enum
+     * counts quarter turns starting at 0 -- TODO confirm */
+    const int rotation_value = static_cast<int>(rotate) * 90;
+
+    /* config values are same with Qt5 Page::TextList() */
+    /* Automatic storage: the device is destroyed on every exit path
+     * (including exceptions), so no matching delete is needed. */
+    TextOutputDev output_dev(NULL,    /* char* fileName */
+                             gFalse,  /* GBool physLayoutA */
+                             0,       /* double fixedPitchA */
+                             gFalse,  /* GBool rawOrderA */
+                             gFalse); /* GBool append */
+
+    /* config values are same with Qt5 Page::TextList() */
+    d->doc->doc->displayPageSlice(&output_dev,
+                                  d->index + 1,           /* page */
+                                  72, 72, rotation_value, /* hDPI, vDPI, rot */
+                                  false, false, false,    /* useMediaBox, crop, printing */
+                                  -1, -1, -1, -1,         /* sliceX, sliceY, sliceW, sliceH */
+                                  NULL, NULL,             /* abortCheckCbk(), abortCheckCbkData */
+                                  NULL, NULL,             /* annotDisplayDecideCbk(), annotDisplayDecideCbkData */
+                                  gTrue);                 /* copyXRef */
+
+    TextWordList *word_list = output_dev.makeWordList();
+    if (!word_list) {
+        return output_list;
+    }
+
+    const int word_count = word_list->getLength();
+    output_list.reserve(word_count);
+    for (int i = 0; i < word_count; i ++) {
+        TextWord *word = word_list->get(i);
+
+        /* the GooString obtained from getText() is deleted here once it
+         * has been converted */
+        GooString *gooWord = word->getText();
+        ustring ustr = detail::unicode_GooString_to_ustring(gooWord);
+        delete gooWord;
+
+        double xMin, yMin, xMax, yMax;
+        word->getBBox(&xMin, &yMin, &xMax, &yMax);
+
+        text_box* tb = new text_box(ustr, rectf(xMin, yMin, xMax-xMin, yMax-yMin));
+        tb->m_data->has_space_after = (word->hasSpaceAfter() == gTrue);
+
+        /* xMin..yMax are reused below for the per-character boxes */
+        const int char_count = word->getLength();
+        tb->m_data->char_bboxes.reserve(char_count);
+        for (int j = 0; j < char_count; j ++) {
+            word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax);
+            tb->m_data->char_bboxes.push_back(rectf(xMin, yMin, xMax-xMin, yMax-yMin));
+        }
+
+        /* chain the previously created box to this one */
+        if (!output_list.empty())
+            output_list.back()->m_data->next_text_box = tb;
+
+        output_list.push_back(tb);
+    }
+    delete word_list;
+    return output_list;
+}
diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h
index 7b4298a..18af213 100644
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -25,6 +25,22 @@
namespace poppler
{
+class text_box_data;
+/*
+ * A piece of page text together with its position, as produced by
+ * page::text_list().
+ *
+ * A text_box owns its private text_box_data record and deletes it in the
+ * destructor.  Copying is therefore disabled: an implicit member-wise copy
+ * would leave two objects deleting the same record.
+ */
+class POPPLER_CPP_EXPORT text_box {
+    friend class page;
+public:
+    text_box(const ustring &text, const rectf &bBox);
+    ~text_box();
+    /* text of this box */
+    ustring text() const;
+    /* bounding rectangle of this box */
+    rectf bbox() const;
+    /* following box, or a null pointer; the pointee is not owned */
+    text_box *next_text_box() const;
+    /* alias of next_text_box(), named after TextBox::nextWord() of the Qt frontend */
+    text_box *next_word() const { return next_text_box(); }
+    /* bounding rectangle of the i-th character */
+    rectf char_bbox(int i) const;
+    bool has_space_after() const;
+private:
+    /* Declared but intentionally left undefined: text_box is non-copyable. */
+    text_box(const text_box &);
+    text_box &operator=(const text_box &);
+
+    text_box_data* m_data;
+};
+
class document;
class document_private;
class page_private;
@@ -63,6 +79,8 @@ public:
ustring text(const rectf &rect = rectf()) const;
ustring text(const rectf &rect, text_layout_enum layout_mode) const;
+ /*
+  * Returns the text boxes extracted from this page for the given rotation.
+  * Every element is allocated with new; the caller must delete each
+  * text_box of the returned vector.
+  */
+ std::vector<text_box*> text_list(rotation_enum rotation) const;
+
private:
page(document_private *doc, int index);
diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h
index 147073d..726068a 100644
--- a/cpp/poppler-private.h
+++ b/cpp/poppler-private.h
@@ -67,6 +67,21 @@ void delete_all(const Collection &c)
delete_all(c.begin(), c.end());
}
+class text_box;
+/*
+ * Private data record behind poppler::text_box.  It is filled in by the
+ * text_box constructor (text, bbox) and by page::text_list() (the rest).
+ */
+class text_box_data
+{
+public:
+ text_box_data()
+ : next_text_box(0), has_space_after(false)
+ {
+ }
+ ustring text; /* text of the box */
+ rectf bbox; /* bounding rectangle of the whole box */
+ text_box *next_text_box; /* following box, not owned; 0 until set */
+ std::vector<rectf> char_bboxes; /* one bounding rectangle per character */
+ bool has_space_after; /* copied from TextWord::hasSpaceAfter() */
+};
+
}
#endif
diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp
index 706ad39..09b180d 100644
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -50,6 +50,7 @@ bool show_embedded_files = false;
bool show_pages = false;
bool show_help = false;
char show_text[32];
+bool show_text_list = false; /* set by the --show-text-list flag */
poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
static const ArgDesc the_args[] = {
@@ -71,6 +72,8 @@ static const ArgDesc the_args[] = {
"show pages information" },
{ "--show-text", argString, &show_text, sizeof(show_text),
"show text (physical|raw) extracted from all pages" },
+ { "--show-text-list", argFlag, &show_text_list, 0,
+ "show text list (experimental)" },
{ "-h", argFlag, &show_help, 0,
"print usage information" },
{ "--help", argFlag, &show_help, 0,
@@ -323,6 +326,29 @@ static void print_page_text(poppler::page *p)
std::cout << std::endl;
}
+/*
+ * Dumps the text boxes of page p, one per line, as
+ *   [text] @ ( x=.. y=.. w=.. h=.. )
+ * between two "---" marker lines.  Each box is deleted right after it has
+ * been printed, because the vector returned by text_list() holds pointers
+ * to objects allocated with new.  A null page only prints the
+ * "Broken Page" notice.
+ */
+static void print_page_text_list(poppler::page *p)
+{
+    if (p == nullptr) {
+        std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
+        std::cout << std::endl;
+        return;
+    }
+
+    std::vector<poppler::text_box*> boxes = p->text_list(poppler::rotate_0);
+
+    std::cout << "---" << std::endl;
+    for (poppler::text_box *box : boxes) {
+        poppler::ustring word = box->text();
+        poppler::rectf area = box->bbox();
+
+        std::cout << "[" << word << "] @ "
+                  << "( x=" << area.x() << " y=" << area.y()
+                  << " w=" << area.width() << " h=" << area.height() << " )"
+                  << std::endl;
+
+        delete box;
+    }
+    std::cout << "---" << std::endl;
+}
+
+
int main(int argc, char *argv[])
{
if (!parseArgs(the_args, &argc, argv)
@@ -398,6 +424,14 @@ int main(int argc, char *argv[])
print_page_text(p.get());
}
}
+ if (show_text_list) {
+ const int pages = doc->pages();
+ for (int i = 0; i < pages; ++i) {
+ std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+ std::unique_ptr<poppler::page> p(doc->create_page(i));
+ print_page_text_list(p.get());
+ }
+ }
return 0;
}
_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler