Hi,
The attached patches introduce a new API to access raw text.
I hope a maintainer of poppler-glib can review them.
poppler-0.15.0_glib-lib.diff
patch to declare new function and its implementation
--
At present, poppler does not provide a simple sample program that uses this function.
I have attached a patch that adds a small sample, "poppler-glib-get-text", for
reviewers. It is only a sample for review; I am not proposing
it for inclusion as an official application.
poppler-0.15.0_glib-demo.diff
patch to add a sample program testing poppler_page_get_selected_text()
and poppler_page_get_selected_raw_text().
Regards,
mpsuzuki
carlosgc wrote:
> Excerpts from mpsuzuki's message of mar sep 07 09:21:13 +0200 2010:
>
>> On Tue, 07 Sep 2010 09:04:13 +0200
>> carlosgc <[email protected]> wrote:
>>
>>> Excerpts from mpsuzuki's message of mar sep 07 08:42:31 +0200 2010:
>>>
>>>> It dumps the strings collected by TextSelectionVisitor
>>>> object. TextSelectionVisitor define 3 methods to eat the text,
>>>> visitBlock(), visitLine() and visitWord(). But only visitLine()
>>>> method is implemented. Because "line" is defined by the
>>>> analysis of the text layout, there is no lines in raw order.
>>>>
>>>>
>>> Why not simply use TextOutputDev::getText() like qt4 frontend does?
>>> TextOutputDev::getSelectionText() is meant for selections, but you
>>> don't want text in raw order for selections. I would just add a new
>>> method gchar *poppler_page_get_raw_text (PopplerPage *page);
>>>
>> Oh. If you think it's acceptable design, I will do so.
>>
>
> Yes.
>
>
>> I want to add new method with argument to specify the
>> rectangle area where the text is extracted.
>>
>
> We currently have:
>
> - poppler_page_get_selected_text, that takes a rectangle
> - poppler_page_get_text, that doesn't take a rectangle
>
> We have already broken the API with poppler_page_get_text so we can
> just add a new parameter to specify the text order, or we can add
> another method poppler_page_get_raw_text(). I prefer to add a new
> method because poppler_page_get_text is used in combination to
> poppler_page_get_text_layout()
>
>
>> Anyway, thank you for enlightening me with quick reply.
>>
>> Regards,
>> mpsuzuki
>>
>
> Regards,
>
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1011,6 +1011,81 @@ poppler_page_get_text (PopplerPage *page)
}
/**
+ * poppler_page_get_selected_raw_text:
+ * @page: a #PopplerPage
+ * @style: a #PopplerSelectionStyle; currently ignored, because the raw
+ *   text is extracted by rectangle only
+ * @selection: the #PopplerRectangle including the text; if all four of its
+ *   coordinates are 0, the crop box of @page is used instead
+ *
+ * Retrieves the contents of the specified @selection as raw text.
+ * The page is rendered into a raw-mode text device on every call.
+ *
+ * Return value: a newly allocated string with the contents of the
+ *   @selection, to be freed with g_free(), or %NULL if @page is not a
+ *   #PopplerPage or @selection is %NULL
+ * Since: 0.16
+ **/
+char *
+poppler_page_get_selected_raw_text (PopplerPage *page,
+                                    PopplerSelectionStyle style,
+                                    PopplerRectangle *selection)
+{
+  TextOutputDev *raw_text_dev;
+  GooString *sel_text;
+  PopplerRectangle crop_box;
+  const PopplerRectangle *area;
+  char *result;
+
+  /* The return type is a pointer, so invalid arguments yield NULL. */
+  g_return_val_if_fail (POPPLER_IS_PAGE (page), NULL);
+  g_return_val_if_fail (selection != NULL, NULL);
+
+  /* @style is not used: in raw text a line is not defined, and getText()
+   * below takes only a rectangle. */
+  (void) style;
+
+  /* An all-zero rectangle stands for the whole crop box of the page. */
+  area = selection;
+  if (selection->x1 == 0 && selection->y1 == 0 &&
+      selection->x2 == 0 && selection->y2 == 0)
+    {
+      poppler_page_get_crop_box (page, &crop_box);
+      area = &crop_box;
+    }
+
+  /* Render the page into a text device of its own, created in raw mode. */
+  raw_text_dev = new TextOutputDev (NULL,
+                                    gFalse,
+                                    gTrue, /* raw mode */
+                                    gFalse);
+  page->document->doc->displayPageSlice (raw_text_dev,
+                                         page->index + 1,
+                                         72,    /* horizontal dpi */
+                                         72,    /* vertical dpi */
+                                         0,     /* rotation */
+                                         false, /* use media box */
+                                         true,  /* crop */
+                                         false, /* printing */
+                                         -1,    /* no slice: x */
+                                         -1,    /* no slice: y */
+                                         -1,    /* no slice: width */
+                                         -1);   /* no slice: height */
+
+  sel_text = raw_text_dev->getText (area->x1,
+                                    area->y1,
+                                    area->x2,
+                                    area->y2);
+  result = g_strdup (sel_text->getCString ());
+
+  /* Both the extracted string and the device were allocated here. */
+  delete sel_text;
+  delete raw_text_dev;
+
+  return result;
+}
+
+/**
* poppler_page_find_text:
* @page: a #PopplerPage
* @text: the text to search for (UTF-8 encoded)
--- a/glib/poppler-page.h
+++ b/glib/poppler-page.h
@@ -99,6 +99,9 @@ char *poppler_page_get_text (PopplerPage *pa
char *poppler_page_get_selected_text (PopplerPage *page,
PopplerSelectionStyle style,
PopplerRectangle *selection);
+char *poppler_page_get_selected_raw_text(PopplerPage *page,
+ PopplerSelectionStyle style,
+ PopplerRectangle *selection);
GList *poppler_page_get_selection_region (PopplerPage *page,
gdouble scale,
PopplerSelectionStyle style,
--- a/glib/demo/Makefile.am
+++ b/glib/demo/Makefile.am
@@ -9,7 +9,9 @@ INCLUDES = \
AM_LDFLAGS = @auto_import_flags@
if BUILD_GTK_TEST
-noinst_PROGRAMS = poppler-glib-demo
+noinst_PROGRAMS = \
+ poppler-glib-demo \
+ poppler-glib-get-text
endif
poppler_glib_demo_SOURCES = \
@@ -49,7 +51,13 @@ poppler_glib_demo_SOURCES = \
utils.h \
utils.c
+poppler_glib_get_text_SOURCES = \
+ poppler-glib-get-text.c
+
LDADD = \
$(top_builddir)/glib/libpoppler-glib.la \
$(top_builddir)/poppler/libpoppler.la \
$(GTK_TEST_LIBS)
+
+CCLD = $(CXXLD)
+LDFLAGS = -static -lstdc++
--- /dev/null
+++ b/glib/demo/poppler-glib-get-text.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2010 suzuki toshiya <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <poppler.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+#include <gtk/gtk.h>
+
+
+extern char *optarg;
+extern int optind, optopt, opterr;
+
+
+/* Convert a command line argument to a URI: absolute paths and "file://"
+ * URIs are used as given, other scheme-less arguments are resolved against
+ * the current directory.  Returns a newly allocated string, or NULL (with
+ * *error set on conversion failure, unset for an unsupported scheme). */
+static char *
+pathname_to_uri( const char *pathname, GError **error )
+{
+  char *cwd, *absolute, *uri;
+
+  if ( g_path_is_absolute( pathname ) )
+    return g_filename_to_uri( pathname, NULL, error );
+  if ( g_ascii_strncasecmp( pathname, "file://", strlen( "file://" ) ) == 0 )
+    return g_strdup( pathname );
+  if ( g_strrstr( pathname, "://" ) )
+    return NULL;
+
+  cwd = g_get_current_dir( );
+  absolute = g_build_filename( cwd, pathname, NULL );
+  g_free( cwd );
+  uri = g_filename_to_uri( absolute, NULL, error );
+  g_free( absolute );
+  return uri;
+}
+
+/* Print the text of a range of pages of a PDF file, in raw order when
+ * --raw is given.  -f/-l select the first/last page (1-based); -x, -y, -W
+ * and -H describe the area to extract in device units at the -r resolution.
+ * Exit status: 0 success, 1 usage, 2 unreadable file, 3 unsupported URI,
+ * 4 URI conversion failure, 5 document load failure, 6 bad page range. */
+int main (int argc, char **argv)
+{
+  double resolution = 72;
+  double xoffset = 0, yoffset = 0;
+  double width = 0, height = 0;
+  int firstpage = 0, lastpage = 0; /* 0-based once parsed */
+  static int rawOrder = 0; /* static: long_options stores its address */
+  static struct option long_options[] = {
+    {"raw", no_argument, &rawOrder, 1},
+    {0, 0, 0, 0}
+  };
+  PopplerDocument *document;
+  PopplerRectangle rect;
+  GError *error = NULL;
+  const char *pathname;
+  char *uri;
+  int c, pg, npages;
+
+  while ( -1 != ( c = getopt_long( argc, argv, ":r:x:y:W:H:f:l:", long_options, NULL ) ) )
+    {
+      switch( c )
+        {
+        case 0: /* --raw: getopt_long() has already set rawOrder */
+        case 1:
+          break;
+        case 'f': /* first page */
+          firstpage = atoi( optarg ) - 1;
+          break;
+        case 'l': /* last page */
+          lastpage = atoi( optarg ) - 1;
+          break;
+        case 'r': /* resolution */
+          resolution = atof( optarg );
+          break;
+        case 'x': /* x offset */
+          xoffset = atof( optarg );
+          break;
+        case 'y': /* y offset */
+          yoffset = atof( optarg );
+          break;
+        case 'W': /* width */
+          width = atof( optarg );
+          break;
+        case 'H': /* height */
+          height = atof( optarg );
+          break;
+        default: /* help */
+          printf( "Usage (TBD)\n" );
+          exit( 1 );
+        }
+    }
+
+  /* A file argument and a usable (positive) resolution are mandatory. */
+  if ( optind >= argc || !( resolution > 0 ) )
+    {
+      printf( "Usage (TBD)\n" );
+      exit( 1 );
+    }
+  pathname = argv[optind]; /* saved before gtk_init() may modify argv */
+
+  /* Convert the requested area from device units to PDF points. */
+  rect.x1 = xoffset * 72 / resolution;
+  rect.y1 = yoffset * 72 / resolution;
+  rect.x2 = ( xoffset + width ) * 72 / resolution;
+  rect.y2 = ( yoffset + height ) * 72 / resolution;
+
+  if ( 0 != access( pathname, R_OK ) )
+    {
+      fprintf( stderr, "Could not open %s\n", pathname );
+      exit( 2 );
+    }
+
+  if ( !g_thread_supported() )
+    g_thread_init( NULL );
+  gtk_init( &argc, &argv );
+
+  uri = pathname_to_uri( pathname, &error );
+  if ( error || !uri )
+    {
+      g_print ("Error: %s\n", error ? error->message : "unsupported uri");
+      exit( error ? 4 : 3 );
+    }
+
+  document = poppler_document_new_from_file( uri, NULL, &error );
+  g_free( uri );
+  if ( !document )
+    {
+      g_print ("Error: %s\n", error ? error->message : "cannot load document");
+      exit( 5 );
+    }
+
+  /* Page indices are 0-based, so the last valid index is npages - 1. */
+  npages = poppler_document_get_n_pages( document );
+  if ( firstpage < 0 || npages <= firstpage || lastpage < firstpage )
+    exit( 6 );
+  if ( npages <= lastpage )
+    lastpage = npages - 1;
+
+  for ( pg = firstpage; pg <= lastpage; pg++ )
+    {
+      PopplerPage *page = poppler_document_get_page( document, pg );
+      PopplerRectangle area = rect; /* per-page copy; all-zero = whole page */
+      char *gottext;
+
+      if ( !page )
+        continue;
+      if ( area.x1 == 0 && area.y1 == 0 && area.x2 == 0 && area.y2 == 0 )
+        poppler_page_get_size( page, &(area.x2), &(area.y2) );
+      if ( rawOrder > 0 )
+        gottext = poppler_page_get_selected_raw_text( page, POPPLER_SELECTION_GLYPH, &area );
+      else
+        gottext = poppler_page_get_selected_text( page, POPPLER_SELECTION_GLYPH, &area );
+      printf( "[Page %d]:[%s]\n", pg, gottext ? gottext : "" );
+      g_free( gottext );
+      g_object_unref( page );
+    }
+  g_object_unref( document );
+  exit( 0 );
+}
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler