Hi,

The attached patches introduce a new API for accessing raw text.
I hope a poppler-glib maintainer can review them.

poppler-0.15.0_glib-lib.diff
patch to declare new function and its implementation

--

At present, poppler does not provide a simple sample that uses the function.
I have attached a patch that adds a small sample, "poppler-glib-get-text", for
the reviewer. It is only a sample for the reviewer; I am not proposing
this patch as an official application.

poppler-0.15.0_glib-demo.diff
patch to add a sample program testing poppler_page_get_selected_text()
and poppler_page_get_selected_raw_text().

Regards,
mpsuzuki



carlosgc wrote:
> Excerpts from mpsuzuki's message of mar sep 07 09:21:13 +0200 2010:
>   
>> On Tue, 07 Sep 2010 09:04:13 +0200
>> carlosgc <[email protected]> wrote:
>>     
>>> Excerpts from mpsuzuki's message of mar sep 07 08:42:31 +0200 2010:
>>>       
>>>> It dumps the strings collected by the TextSelectionVisitor
>>>> object. TextSelectionVisitor defines 3 methods to eat the text:
>>>> visitBlock(), visitLine() and visitWord(). But only the visitLine()
>>>> method is implemented. Because a "line" is defined by the
>>>> analysis of the text layout, there are no lines in raw order.
>>>>
>>>>         
>>> Why not simply use TextOutputDev::getText() like qt4 frontend does?
>>> TextOutputDev::getSelectionText() is meant for selections, but you
>>> don't want text in raw order for selections. I would just add a new
>>> method gchar *poppler_page_get_raw_text (PopplerPage *page);
>>>       
>> Oh. If you think it's acceptable design, I will do so.
>>     
>
> Yes.
>
>   
>> I want to add new method with argument to specify the
>> rectangle area where the text is extracted.
>>     
>
> We currently have:
>
>  - poppler_page_get_selected_text, that takes a rectangle
>  - poppler_page_get_text, that doesn't take a rectangle
>
> We have already broken the API with poppler_page_get_text so we can
> just add a new parameter to specify the text order, or we can add
> another method poppler_page_get_raw_text(). I prefer to add a new
> method because poppler_page_get_text is used in combination with
> poppler_page_get_text_layout()
>
>   
>> Anyway, thank you for enlightening me with quick reply.
>>
>> Regards,
>> mpsuzuki
>>     
>
> Regards, 
>   

--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1011,6 +1011,81 @@ poppler_page_get_text (PopplerPage *page)
 }
 
 /**
+ * poppler_page_get_selected_raw_text:
+ * @page: a #PopplerPage
+ * @style: a #PopplerSelectionStyle
+ * @selection: the #PopplerRectangle including the text
+ *
+ * Retrieves the contents of the specified @selection as raw text.
+ *
+ * Return value: a newly allocated string with the contents of the
+ *               @selection; free it with g_free()
+ * Since: 0.16
+ **/
+char *
+poppler_page_get_selected_raw_text (PopplerPage          *page,
+				    PopplerSelectionStyle style,
+				    PopplerRectangle     *selection)
+{
+  GooString *sel_text;
+  char *result;
+  PopplerRectangle cropBox;
+  TextOutputDev *raw_text_dev;
+
+  /* A pointer-returning function must fail with NULL, not FALSE. */
+  g_return_val_if_fail (POPPLER_IS_PAGE (page), NULL);
+  g_return_val_if_fail (selection != NULL, NULL);
+
+  /* @style is kept for signature parity with
+   * poppler_page_get_selected_text() but is not consulted: the getText()
+   * calls below are driven by the rectangle alone. */
+  (void) style;
+
+  /* Render the page into a text device created in raw-order mode. */
+  raw_text_dev = new TextOutputDev (NULL,
+				    gFalse,
+				    gTrue,	/* raw mode */
+				    gFalse);
+  page->document->doc->displayPageSlice(raw_text_dev,
+					 page->index + 1,
+					 72,
+					 72,
+					 0,
+					 false,
+					 true,
+					 false,
+					 -1,
+					 -1,
+					 -1,
+					 -1);
+
+  /* An all-zero rectangle selects the whole crop box of the page. */
+  if (selection->x1 == 0 && selection->y1 == 0 &&
+      selection->x2 == 0 && selection->y2 == 0)
+  {
+    poppler_page_get_crop_box( page, &cropBox );
+    sel_text = raw_text_dev->getText(cropBox.x1,
+				     cropBox.y1,
+				     cropBox.x2,
+				     cropBox.y2);
+  }
+  else
+    sel_text = raw_text_dev->getText(selection->x1,
+				     selection->y1,
+				     selection->x2,
+				     selection->y2);
+
+  /* Copy into GLib-owned memory, then release both C++ objects; the
+   * device was previously never deleted and leaked on every call. */
+  result = g_strdup (sel_text->getCString ());
+  delete sel_text;
+  delete raw_text_dev;
+
+  /* Ownership passes to the caller, who releases it with g_free(). */
+  return result;
+}
+
+/**
  * poppler_page_find_text:
  * @page: a #PopplerPage
  * @text: the text to search for (UTF-8 encoded)
--- a/glib/poppler-page.h
+++ b/glib/poppler-page.h
@@ -99,6 +99,9 @@ char                  *poppler_page_get_text             (PopplerPage        *pa
 char                  *poppler_page_get_selected_text    (PopplerPage        *page,
 							  PopplerSelectionStyle style,
 							  PopplerRectangle   *selection);
+char                  *poppler_page_get_selected_raw_text(PopplerPage        *page,
+							  PopplerSelectionStyle style,
+							  PopplerRectangle   *selection);
 GList                 *poppler_page_get_selection_region (PopplerPage        *page,
 							  gdouble             scale,
 							  PopplerSelectionStyle style,
--- a/glib/demo/Makefile.am
+++ b/glib/demo/Makefile.am
@@ -9,7 +9,9 @@ INCLUDES = 					\
 AM_LDFLAGS = @auto_import_flags@
 
 if BUILD_GTK_TEST
-noinst_PROGRAMS = poppler-glib-demo
+noinst_PROGRAMS = 				\
+	poppler-glib-demo 			\
+	poppler-glib-get-text
 endif
 
 poppler_glib_demo_SOURCES = 			\
@@ -49,7 +51,13 @@ poppler_glib_demo_SOURCES = 			\
 	utils.h					\
 	utils.c
 
+poppler_glib_get_text_SOURCES =			\
+	poppler-glib-get-text.c
+
 LDADD =						\
 	$(top_builddir)/glib/libpoppler-glib.la	\
 	$(top_builddir)/poppler/libpoppler.la	\
 	$(GTK_TEST_LIBS)
+
+CCLD = $(CXXLD)
+LDFLAGS = -static -lstdc++
--- /dev/null
+++ b/glib/demo/poppler-glib-get-text.c
@@ -0,0 +1,181 @@
+/* 
+ * Copyright (C) 2010 suzuki toshiya <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <poppler.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+#include <gtk/gtk.h>
+
+
+extern char *optarg;
+extern int optind, optopt, opterr;
+
+
+int main (int argc, char **argv)
+{
+  double  resolution = 72;
+  double  xoffset    = 0;
+  double  yoffset    = 0;
+  double  width      = 0;
+  double  height     = 0;
+  int     firstpage  = 0;
+  int     lastpage   = 0;
+  int     c, pg, maxpage;
+  char   *pathname, *uri;
+  static int  rawOrder   = 0; /* static: &rawOrder initializes long_options */
+  static struct option long_options[] = {
+   {"raw", no_argument, &rawOrder, 1},
+   {0, 0, 0, 0}
+  };
+
+  PopplerDocument  *document;
+  PopplerRectangle  rect;
+  GError           *error = NULL;
+
+  /* Sample driver: print the text of a range of pages of one document,
+   * in raw order when --raw is given.  Stage 1: parse the command line. */
+  while ( -1 != ( c = getopt_long( argc, argv, ":r:x:y:W:H:f:l:", long_options, NULL ) ) )
+  {
+    switch( c )
+    {
+    case 0: /* long option that only stored a flag (--raw) */
+    case 1:
+      break;
+    case 'f': /* first page, 1-based on the command line */
+      firstpage = atoi( optarg ) - 1;
+      break;
+    case 'l': /* last page, 1-based on the command line */
+      lastpage = atoi( optarg ) - 1;
+      break;
+    case 'r': /* resolution */
+      resolution = atof( optarg );
+      break;
+    case 'x': /* x offset */
+      xoffset = atof( optarg );
+      break;
+    case 'y': /* y offset */
+      yoffset = atof( optarg );
+      break;
+    case 'W': /* width */
+      width = atof( optarg );
+      break;
+    case 'H': /* height */
+      height = atof( optarg );
+      break;
+    default: /* help */
+      printf( "Usage (TBD)\n" );
+      exit( 1 );
+    }
+  }
+
+  /* No file operand leaves argv[optind] NULL; resolution is a divisor below. */
+  if ( optind >= argc || resolution <= 0 )
+  {
+    printf( "Usage (TBD)\n" );
+    exit( 1 );
+  }
+
+  rect.x1 = xoffset * 72 / resolution;
+  rect.y1 = yoffset * 72 / resolution;
+  rect.x2 = ( xoffset + width )  * 72 / resolution;
+  rect.y2 = ( yoffset + height ) * 72 / resolution;
+
+  /* Stage 2: turn the file operand into a URI and open the document. */
+  pathname = argv[optind];
+  if ( 0 != access( pathname, R_OK ) )
+  {
+    fprintf( stderr, "Could not open %s\n", pathname );
+    exit( 2 );
+  }
+
+  if ( !g_thread_supported() )
+    g_thread_init( NULL );
+  gtk_init( &argc, &argv );
+
+  if ( g_path_is_absolute( pathname ) ) {
+    uri = g_filename_to_uri( pathname, NULL, &error );
+  } else if ( g_ascii_strncasecmp( pathname, "file://", strlen( "file://" ) ) == 0 ) {
+    uri = g_strdup ( pathname );
+  } else if ( !g_strrstr( pathname, "://" ) ) {
+    gchar *dir      = g_get_current_dir( );
+    gchar *filename = g_build_filename( dir, pathname, NULL );
+
+    g_free (dir);
+    uri = g_filename_to_uri (filename, NULL, &error);
+    g_free (filename);
+  } else {
+    g_print ("Error: unsupported uri\n");
+    exit( 3 );
+  }
+
+  if (error) {
+    g_print ("Error: %s\n", error->message);
+    g_error_free (error);
+    exit( 4 );
+  }
+
+  document = poppler_document_new_from_file( uri, NULL, &error );
+  g_free( uri );
+  if ( !document )
+  {
+    g_print ("Error: %s\n", error ? error->message : "cannot open document");
+    g_clear_error( &error );
+    exit( 5 );
+  }
+
+  /* Stage 3: print the text of every requested page.  Page indices are
+   * 0-based, so the valid values are 0 .. maxpage - 1. */
+  maxpage = poppler_document_get_n_pages( document );
+  if ( firstpage < 0 || maxpage <= firstpage || lastpage < firstpage )
+  {
+    fprintf( stderr, "Invalid page range\n" );
+    exit( 6 );
+  }
+
+  if ( maxpage <= lastpage )
+    lastpage = maxpage - 1;
+
+  for ( pg = firstpage; pg <= lastpage; pg++ )
+  {
+    PopplerPage*      page;
+    PopplerRectangle  area = rect; /* per-page copy; rect stays as requested */
+    char*             gottext;
+
+    page = poppler_document_get_page( document, pg );
+    if ( !page )
+      continue;
+
+    /* No area given: use the size of this particular page. */
+    if ( area.x1 == 0 && area.y1 == 0 && area.x2 == 0 && area.y2 == 0 )
+      poppler_page_get_size( page, &(area.x2), &(area.y2) );
+    if (rawOrder > 0)
+      gottext = poppler_page_get_selected_raw_text(page, POPPLER_SELECTION_GLYPH, &area);
+    else
+      gottext = poppler_page_get_selected_text(page, POPPLER_SELECTION_GLYPH, &area);
+    printf( "[Page %d]:[%s]\n", pg, gottext ? gottext : "" );
+    g_free( gottext );
+    g_object_unref( page );
+  }
+  g_object_unref( document );
+  exit( 0 );
+}
_______________________________________________
poppler mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to