Hi,
The attached patches introduce a new method
to access raw text via the poppler-cpp binding.
I hope a maintainer of poppler-cpp can review them.
poppler-0.15.0_cpp-lib.diff
patch to declare the new method and add its implementation
--
I also extended cpp/tests/poppler-dump to compare
the output of page::text() and page::raw_text().
poppler-0.15.0_cpp-tests.diff
patch to add new options to poppler-dump, "--show-texts"
and "--raw".
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -233,18 +233,16 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
return found;
}
-/**
- Returns the text in the page.
+enum text_layout_enum {
+ physical_layout,
+ raw_order
+};
- \param r if not empty, it will be extracted the text in it; otherwise, the
- text of the whole page
-
- \returns the text of the page in the specified rect or in the whole page
- */
-ustring page::text(const rectf &r) const
+static ustring get_text(page_private* d, const rectf &r, const text_layout_enum layout)
{
std::auto_ptr<GooString> s;
- TextOutputDev td(0, gFalse, gFalse, gFalse);
+ GBool is_raw_order = ( layout == raw_order );
+ TextOutputDev td(0, gFalse, is_raw_order, gFalse);
d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false);
if (r.is_empty()) {
const PDFRectangle *rect = d->page->getCropBox();
@@ -254,3 +252,29 @@ ustring page::text(const rectf &r) const
}
return ustring::from_utf8(s->getCString());
}
+
+/**
+ Returns the text in the page.
+
+ \param r if not empty, it will be extracted the text in it; otherwise, the
+ text of the whole page
+
+ \returns the text of the page in the specified rect or in the whole page
+ */
+ustring page::text(const rectf &r) const
+{
+ return get_text(d, r, physical_layout);
+}
+
+/**
+ Returns the raw text in the page.
+
+ \param r if not empty, it will be extracted the raw text in it; otherwise,
+ the raw text of the whole page
+
+ \returns the raw text of the page in the specified rect or in the whole page
+ */
+ustring page::raw_text(const rectf &r) const
+{
+ return get_text(d, r, raw_order);
+}
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -57,6 +57,7 @@ public:
bool search(const ustring &text, rectf &r, search_direction_enum direction,
case_sensitivity_enum case_sensitivity, rotation_enum rotation = rotate_0) const;
ustring text(const rectf &rect = rectf()) const;
+ ustring raw_text(const rectf &rect = rectf()) const;
private:
page(document_private *doc, int index);
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -43,6 +43,8 @@ bool show_toc = false;
bool show_fonts = false;
bool show_embedded_files = false;
bool show_pages = false;
+bool show_texts = false;
+bool use_raw_text = false;
bool show_help = false;
static const ArgDesc the_args[] = {
@@ -62,6 +64,10 @@ static const ArgDesc the_args[] = {
"show the document-level embedded files" },
{ "--show-pages", argFlag, &show_pages, 0,
"show pages information" },
+ { "--show-texts", argFlag, &show_texts, 0,
+ "show texts extracted from all pages" },
+ { "--raw", argFlag, &use_raw_text, 0,
+ "show raw text if --show-texts is specified" },
{ "-h", argFlag, &show_help, 0,
"print usage information" },
{ "--help", argFlag, &show_help, 0,
@@ -288,6 +294,21 @@ static void print_page(poppler::page *p)
std::cout << std::endl;
}
+static void print_byte_array(poppler::byte_array ary)
+{
+ for ( unsigned int i = 0; i < ary.size(); i ++ )
+ std::cout << (char)(ary[i]);
+ std::cout << std::endl;
+}
+
+static void print_text(poppler::page *p)
+{
+ if (use_raw_text)
+ print_byte_array(p->raw_text(p->page_rect()).to_utf8());
+ else
+ print_byte_array(p->text(p->page_rect()).to_utf8());
+}
+
int main(int argc, char *argv[])
{
if (!parseArgs(the_args, &argc, argv)
@@ -345,6 +366,14 @@ int main(int argc, char *argv[])
print_page(p.get());
}
}
+ if (show_texts) {
+ const int pages = doc->pages();
+ for (int i = 0; i < pages; ++i) {
+ std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+ std::auto_ptr<poppler::page> p(doc->create_page(i));
+ print_text(p.get());
+ }
+ }
return 0;
}
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler