Hi,

The attached patches introduce a new method
for accessing raw text via the poppler-cpp binding.
I hope a poppler-cpp maintainer can review them.

poppler-0.15.0_cpp-lib.diff
patch that declares the new method and adds its implementation

--

I also extended cpp/tests/poppler-dump so that the output of the
page::text() and page::raw_text() methods can be compared.

poppler-0.15.0_cpp-tests.diff
patch that adds two new options to poppler-dump, "--show-texts"
and "--raw".
--- a/cpp/poppler-page.cpp
+++ b/cpp/poppler-page.cpp
@@ -233,18 +233,16 @@ bool page::search(const ustring &text, rectf &r, search_direction_enum direction
     return found;
 }
 
-/**
- Returns the text in the page.
+enum text_layout_enum {
+    physical_layout, // used by page::text(): get_text() leaves is_raw_order false
+    raw_order // used by page::raw_text(): get_text() sets is_raw_order to true
+};
 
- \param r if not empty, it will be extracted the text in it; otherwise, the
-          text of the whole page
-
- \returns the text of the page in the specified rect or in the whole page
- */
-ustring page::text(const rectf &r) const
+static ustring get_text(page_private* d, const rectf &r, const text_layout_enum layout)
 {
     std::auto_ptr<GooString> s;
-    TextOutputDev td(0, gFalse, gFalse, gFalse);
+    GBool is_raw_order = ( layout == raw_order );
+    TextOutputDev td(0, gFalse, is_raw_order, gFalse);
     d->doc->doc->displayPage(&td, d->index + 1, 72, 72, 0, false, true, false);
     if (r.is_empty()) {
         const PDFRectangle *rect = d->page->getCropBox();
@@ -254,3 +252,29 @@ ustring page::text(const rectf &r) const
     }
     return ustring::from_utf8(s->getCString());
 }
+
+/**
+ Returns the text in the page.
+
+ \param r if not empty, only the text inside it is extracted; otherwise, the
+          text of the whole page is extracted
+
+ \returns the text of the page in the specified rect or in the whole page
+ */
+ustring page::text(const rectf &r) const
+{
+    return get_text(d, r, physical_layout);
+}
+
+/**
+ Returns the raw text in the page.
+
+ \param r if not empty, only the raw text inside it is extracted; otherwise,
+          the raw text of the whole page is extracted
+
+ \returns the raw text of the page in the specified rect or in the whole page
+ */
+ustring page::raw_text(const rectf &r) const
+{
+    return get_text(d, r, raw_order); // same helper as text(), but with the raw_order layout
+}
--- a/cpp/poppler-page.h
+++ b/cpp/poppler-page.h
@@ -57,6 +57,7 @@ public:
     bool search(const ustring &text, rectf &r, search_direction_enum direction,
                 case_sensitivity_enum case_sensitivity, rotation_enum rotation = rotate_0) const;
     ustring text(const rectf &rect = rectf()) const;
+    ustring raw_text(const rectf &rect = rectf()) const;
 
 private:
     page(document_private *doc, int index);
--- a/cpp/tests/poppler-dump.cpp
+++ b/cpp/tests/poppler-dump.cpp
@@ -43,6 +43,8 @@ bool show_toc = false;
 bool show_fonts = false;
 bool show_embedded_files = false;
 bool show_pages = false;
+bool show_texts = false;
+bool use_raw_text = false;
 bool show_help = false;
 
 static const ArgDesc the_args[] = {
@@ -62,6 +64,10 @@ static const ArgDesc the_args[] = {
       "show the document-level embedded files" },
     { "--show-pages",          argFlag,  &show_pages,          0,
       "show pages information" },
+    { "--show-texts",          argFlag,  &show_texts,          0,
+      "show texts extracted from all pages" },
+    { "--raw",                 argFlag,  &use_raw_text,        0,
+      "show raw text if --show-texts is specified" },
     { "-h",                    argFlag,  &show_help,           0,
       "print usage information" },
     { "--help",                argFlag,  &show_help,           0,
@@ -288,6 +294,21 @@ static void print_page(poppler::page *p)
     std::cout << std::endl;
 }
 
+static void print_byte_array(poppler::byte_array ary) // writes every element of ary to stdout, then ends the line
+{
+    for ( unsigned int i = 0; i < ary.size(); i ++ )
+	std::cout << (char)(ary[i]); // each element is cast to char and written as one character
+    std::cout << std::endl;
+}
+
+static void print_text(poppler::page *p) // prints the UTF-8 text of page p, restricted to p->page_rect()
+{
+    if (use_raw_text) // set by the "--raw" command-line flag
+	print_byte_array(p->raw_text(p->page_rect()).to_utf8());
+    else
+	print_byte_array(p->text(p->page_rect()).to_utf8());
+}
+
 int main(int argc, char *argv[])
 {
     if (!parseArgs(the_args, &argc, argv)
@@ -345,6 +366,14 @@ int main(int argc, char *argv[])
             print_page(p.get());
         }
     }
+    if (show_texts) {
+        const int pages = doc->pages();
+        for (int i = 0; i < pages; ++i) {
+            std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
+            std::auto_ptr<poppler::page> p(doc->create_page(i));
+            print_text(p.get());
+        }
+    }
 
     return 0;
 }
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to