Hi, I wrote a couple of functions for finding the area used by text in a page; one returns a number of PopplerRectangles that should enclose all text as tightly as possible. Another function returns the smallest rectangle covering all text, like 'gs -sDEVICE=bbox' does in postscript.
The enclosed test program can be run as 'pdfrects file.pdf' and produces a new pdf file 'result.pdf' with the rectangles drawn in various colors. The functions may be used, for example, for copying text from two-column documents, for zooming to the area actually used in the page, or for creating a new document better suited to small-screen devices such as tablets, ebook readers and mobile phones. If you think such functions may be of general use I may make a patch to incorporate them in poppler.
/*
* pdfrects.c
*
* find the area used by text in a page of a pdf document
*
* RectangleList *rectanglelist_textarea(PopplerPage *);
* a list of rectangles that do not touch or overlap and cover all text in
* the page
*
* RectangleList *rectanglelist_textarea_distance(PopplerPage *, gdouble);
* the second argument is the minimal distance to consider a white space;
* lower values lead to finer coverings of the used area
*
* PopplerRectangle *rectanglelist_boundingbox(PopplerPage *);
* the overall bounding box of the page: the smallest rectangle that covers
* all text in the page
*
* arguments to this test program:
* -d distance minimal size of a white space
* -r level the debugrectangles variables (-1 - 5, see below)
* -n draw also the number of each rectangle
* file.pdf file to read; output is always result.pdf
*/
/*
* the algorithm:
*
* 1. C = list containing a rectangle for each character in the page
* (obtained from poppler)
*
* 2. C = join consecutive rectangles of C if they touch or overlap
* (this step is only for efficiency)
*
* 3. W = list comprising only a rectangle as large as the whole page
* for each rectangle R in C:
* - subtract R from each rectangle in W
* (each subtraction may generate up to four rectangles)
* now W is the white area of the page
*
* 4. B = list comprising only a rectangle as large as the whole page
* for each rectangle R in W:
* - subtract R from each rectangle in B
* now B covers the used area of the page
*
* 5. for each pair of rectangles in B:
* - if they touch or overlap, join them
* repeat until nothing changes
*
* 6. return B
*
* variable debugrectangles is for the intermediate lists of rectangles:
* - if its value x is greater than zero, the algorithm is cut short after
* step x and the current list of rectangles is returned; this allows for
* visualizing the intermediate results
* - also, the number of rectangles after each step is printed
* - if its value is -1, the number of rectangles is printed every time it
* changes during a subtraction
*/
/*
* todo:
* images
* sort the rectangles
* page size (from cmdline or from original file)
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <poppler.h>
#include <cairo.h>
#include <cairo-pdf.h>
/*
 * debugging aid for the intermediate lists of rectangles (set by option -r):
 * - 0: no debugging output
 * - any non-zero value: the number of rectangles is printed after each step
 *   of the algorithm
 * - 1, 2, 3 or 4: additionally, rectanglelist_textarea_bound() stops after
 *   that step and returns the list of rectangles as it is at that point
 * - -1: additionally, the number of rectangles is printed every time it
 *   changes during a subtraction
 */
int debugrectangles = 0;
/*
 * bounds on the minimal size of a rectangle, as checked by rectangle_bound():
 * - both: width and height must each be strictly greater than this
 * - one: at least one of width and height must be strictly greater than this
 */
typedef struct {
gdouble both;
gdouble one;
} RectangleBound;
/*
 * tell whether rectangle r is large enough for the bounds b: width and
 * height must both exceed b->both, and at least one must exceed b->one
 */
gboolean rectangle_bound(PopplerRectangle *r, RectangleBound *b) {
	gdouble width = r->x2 - r->x1;
	gdouble height = r->y2 - r->y1;

	if (! (width > b->both && height > b->both))
		return FALSE;
	return width > b->one || height > b->one;
}
/*
 * tell whether rectangle b lies entirely within rectangle a (borders
 * included)
 */
gboolean rectangle_contains(PopplerRectangle *a, PopplerRectangle *b) {
	gboolean horizontally = a->x1 <= b->x1 && b->x2 <= a->x2;
	gboolean vertically = a->y1 <= b->y1 && b->y2 <= a->y2;

	return horizontally && vertically;
}
/*
 * tell whether two rectangles touch or overlap, that is, whether they are
 * not strictly separated along either axis
 */
gboolean rectangle_touch(PopplerRectangle *a, PopplerRectangle *b) {
	gboolean apart_x = a->x2 < b->x1 || b->x2 < a->x1;
	gboolean apart_y = a->y2 < b->y1 || b->y2 < a->y1;

	return ! (apart_x || apart_y);
}
/*
 * copy rectangle orig onto rectangle dest
 *
 * dest and orig may be the same rectangle: this happens when the last
 * element of a list is deleted and when a rectangle is appended from the
 * slot it already occupies; struct assignment is well defined in this case,
 * while memcpy() on overlapping objects is not
 */
void rectangle_copy(PopplerRectangle *dest, PopplerRectangle *orig) {
	*dest = *orig;
}
/*
 * enlarge rectangle a to the smallest rectangle that contains both a and b;
 * b is left unchanged
 */
void rectangle_join(PopplerRectangle *a, PopplerRectangle *b) {
	/* lower-left corner: the smaller coordinates */
	a->x1 = a->x1 < b->x1 ? a->x1 : b->x1;
	a->y1 = a->y1 < b->y1 ? a->y1 : b->y1;

	/* upper-right corner: the larger coordinates */
	a->x2 = a->x2 > b->x2 ? a->x2 : b->x2;
	a->y2 = a->y2 > b->y2 ? a->y2 : b->y2;
}
/*
 * a list of rectangles
 *
 * MAXRECT is the capacity used for the lists of white and black rectangles
 * built during the computation of the text area
 *
 * fields:
 * - rect: array of rectangles
 * - num: number of rectangles currently in the list (the first num entries)
 * - max: maximal number of rectangles the list is allowed to hold
 */
#define MAXRECT 50000
typedef struct {
PopplerRectangle *rect;
guint num;
guint max;
} RectangleList;
/*
* allocate a rectangle list with maximum n rectangles and currently none
*/
RectangleList *rectanglelist_new(int n) {
RectangleList *res;
res = malloc(sizeof(RectangleList));
res->num = 0;
res->max = n;
res->rect = malloc(res->max * sizeof(PopplerRectangle));
return res;
}
/*
 * tighten a rectangle list by deallocating the unused entries
 *
 * the list is left untouched if it is empty (the effect of realloc() with
 * size zero is implementation-defined) or if the reallocation fails (the
 * original array is still valid in that case)
 */
void rectanglelist_tighten(RectangleList *r) {
	PopplerRectangle *shrunk;

	if (r->num == 0)
		return;
	shrunk = realloc(r->rect, r->num * sizeof *shrunk);
	if (shrunk == NULL)
		return;
	r->rect = shrunk;
	r->max = r->num;
}
/*
 * release a rectangle list: both the array of rectangles and the list
 * itself; a NULL list is silently ignored
 */
void rectanglelist_free(RectangleList *rl) {
	if (rl != NULL) {
		free(rl->rect);
		free(rl);
	}
}
/*
 * remove the rectangle of index n from a list
 *
 * the hole is filled by moving the last rectangle of the list into it, so
 * the order of the rectangles is not preserved; an index past the end of
 * the list is ignored
 */
void rectanglelist_delete(RectangleList *rl, guint n) {
	guint last;

	if (n >= rl->num)
		return;
	last = rl->num - 1;
	rl->num = last;
	rectangle_copy(rl->rect + n, rl->rect + last);
}
/*
 * add a rectangle to a list
 *
 * since a RectangleList represents the area that is the union of its
 * rectangles, when adding a new rectangle some simplifications can be done:
 *
 * - the request is ignored if the rectangle is contained in one in the list
 * - if the rectangle contains some rectangles in the list, it replaces them
 *
 * still, a rectangle list can be redundant: for example, a rectangle may be
 * contained in the union of other two
 *
 * returns FALSE only if the rectangle needs a new entry and the list is
 * already full (num == max); TRUE otherwise
 *
 * rect may point to the first unused entry of the list itself
 */
gboolean rectanglelist_add(RectangleList *rl, PopplerRectangle *rect) {
	guint r;
	gboolean placed;

	placed = FALSE;
	r = 0;
	while (r < rl->num) {
		if (rectangle_contains(rl->rect + r, rect))
			return TRUE;
		if (rectangle_contains(rect, rl->rect + r)) {
			if (placed) {
				/* deletion moves the last rectangle into
				 * position r: check this position again
				 * instead of advancing */
				rectanglelist_delete(rl, r);
				continue;
			}
			rectangle_copy(rl->rect + r, rect);
			placed = TRUE;
		}
		r++;
	}
	if (placed)
		return TRUE;

	/* a new entry is needed only at this point */
	if (rl->num >= rl->max)
		return FALSE;
	if (rl->rect + rl->num != rect)
		rectangle_copy(rl->rect + rl->num, rect);
	rl->num++;
	return TRUE;
}
/*
 * append the subtraction of rectangle sub from list orig to list dest:
 * dest += orig - sub
 *
 * for each rectangle a in orig, the parts of a on the four sides of sub are
 * computed: x below sub->x1, y below sub->y1, x above sub->x2 and y above
 * sub->y2, in this order; each part that satisfies the bounds b is added to
 * dest
 *
 * the parts are built in local storage rather than in the first unused
 * entry of dest, which does not exist when dest is full
 *
 * returns FALSE if dest cannot hold all resulting rectangles
 */
gboolean rectanglelist_subtract_append(RectangleList *dest,
		RectangleList *orig, PopplerRectangle *sub,
		RectangleBound *b) {
	guint i, k;
	PopplerRectangle *a;
	PopplerRectangle part[4];

	for (i = 0; i < orig->num; i++) {
		a = orig->rect + i;

		/* each part is a with one side cut at the border of sub */
		part[0] = *a;
		part[0].x2 = MIN(a->x2, sub->x1);

		part[1] = *a;
		part[1].y2 = MIN(a->y2, sub->y1);

		part[2] = *a;
		part[2].x1 = MAX(a->x1, sub->x2);

		part[3] = *a;
		part[3].y1 = MAX(a->y1, sub->y2);

		for (k = 0; k < 4; k++)
			if (rectangle_bound(&part[k], b))
				if (! rectanglelist_add(dest, &part[k]))
					return FALSE;
	}
	return TRUE;
}
/*
 * subtract a rectangle list from another: orig -= sub
 *
 * the rectangles of sub are subtracted one at a time; each subtraction
 * builds a new list that replaces *orig, so the pointer stored in *orig
 * changes
 *
 * returns FALSE if a subtraction does not fit in a list of MAXRECT
 * rectangles or memory cannot be allocated; in this case *orig is still a
 * valid list (the result of the subtractions done so far) and is still
 * owned by the caller
 */
gboolean rectanglelist_subtract(RectangleList **orig, RectangleList *sub,
		RectangleBound *b) {
	RectangleList *dest;
	guint r;

	for (r = 0; r < sub->num; r++) {
		dest = rectanglelist_new(MAXRECT);
		if (dest == NULL)
			return FALSE;
		if (! rectanglelist_subtract_append(dest, *orig,
				sub->rect + r, b)) {
			/* the partial result is of no use */
			rectanglelist_free(dest);
			return FALSE;
		}
		if (debugrectangles == -1 && dest->num != (*orig)->num)
			printf("rectangles: %u\n", dest->num);
		rectanglelist_free(*orig);
		*orig = dest;
	}
	return TRUE;
}
/*
 * merge runs of consecutive rectangles of a list that touch or overlap
 *
 * the list is compacted in place: each rectangle is either joined to the
 * last rectangle kept so far, if it touches it, or kept as a new one
 */
void rectanglelist_consecutive(RectangleList *orig) {
	guint kept, cur;
	PopplerRectangle *next;

	if (orig->num == 0)
		return;

	kept = 0;
	for (cur = 1; cur < orig->num; cur++) {
		next = orig->rect + cur;
		if (! rectangle_touch(orig->rect + kept, next)) {
			kept++;
			rectangle_copy(orig->rect + kept, next);
			continue;
		}
		rectangle_join(orig->rect + kept, next);
	}
	orig->num = kept + 1;
}
/*
 * join all rectangles of a list that touch or overlap
 *
 * the whole scan is repeated until the number of rectangles no longer
 * changes, because joining may produce a rectangle that overlaps a
 * previous one, for example:
 *	1 2
 *	  3
 *	654
 * rectangle 1 does not touch any other; but then 2 is joined to 3, then
 * to 4, 5 and 6; the resulting rectangle includes 1, since joining two
 * rectangles produces a rectangle that includes both
 */
void rectanglelist_join(RectangleList *orig) {
	guint before, first, second;

	do {
		before = orig->num;
		for (first = 0; first < orig->num; first++) {
			second = first + 1;
			while (second < orig->num) {
				if (! rectangle_touch(orig->rect + first,
						orig->rect + second)) {
					second++;
					continue;
				}
				rectangle_join(orig->rect + first,
					orig->rect + second);
				rectanglelist_delete(orig, second);
				/* the first rectangle has grown: compare it
				 * again with all rectangles following it */
				second = first + 1;
			}
		}
	} while (before != orig->num);
}
/*
 * the rectangles of the single characters in the page
 *
 * the array of rectangles is obtained from poppler; the rectangles of the
 * space characters ' ' are made empty (zero width), so that they do not
 * count as used area
 *
 * returns an empty list if the page has no text layout, NULL if the list
 * cannot be allocated
 *
 * NOTE(review): the array comes from poppler and is later released with
 * free() by rectanglelist_free(); this assumes that glib allocates with the
 * system malloc -- confirm for the glib versions to support
 */
RectangleList *rectanglelist_characters(PopplerPage *page) {
	RectangleList *layout;
	char *text, *cur;
	guint r;

	layout = rectanglelist_new(0);
	if (layout == NULL)
		return NULL;

	/* the array is provided by poppler: drop the one allocated above
	 * instead of leaking it */
	free(layout->rect);
	layout->rect = NULL;
	if (! poppler_page_get_text_layout(page,
			&layout->rect, &layout->num))
		layout->num = 0;
	layout->max = layout->num;

	text = poppler_page_get_text(page);
	if (text == NULL)
		return layout;

	/* nullify rectangles of white spaces ' '; yes, it happens;
	 * never read past the end of the text, even if it has fewer
	 * characters than there are rectangles */
	cur = text;
	for (r = 0; r < layout->num && *cur != '\0'; r++) {
		if (*cur == ' ')
			layout->rect[r].x2 = layout->rect[r].x1;
		cur = g_utf8_next_char(cur);
	}
	g_free(text);
	return layout;
}
/*
 * the area used by text in the page
 *
 * whiteboth and whiteone are the bounds (see RectangleBound) on the
 * rectangles of white area, blackboth and blackone those on the rectangles
 * of used area; rectangles not satisfying them are discarded
 *
 * returns a list of rectangles to be released with rectanglelist_free(), or
 * NULL if the computation is impossible because it requires too many
 * rectangles or too much memory; if debugrectangles is between 1 and 4, the
 * list of rectangles after that step of the algorithm is returned instead
 */
RectangleList *rectanglelist_textarea_bound(PopplerPage *page,
		gdouble whiteboth, gdouble whiteone,
		gdouble blackboth, gdouble blackone) {
	RectangleList *layout, *white, *black;
	RectangleBound wb, bb;

	wb.both = whiteboth;
	wb.one = whiteone;
	bb.both = blackboth;
	bb.one = blackone;

	/* step 1: the rectangles of the characters */
	layout = rectanglelist_characters(page);
	if (layout == NULL)
		return NULL;
	if (debugrectangles)
		printf("character rectangles: %u\n", layout->num);
	if (debugrectangles == 1)
		return layout;

	/* step 2: join consecutive rectangles */
	rectanglelist_consecutive(layout);
	if (debugrectangles)
		printf("consecutive rectangles: %u\n", layout->num);
	if (debugrectangles == 2)
		return layout;

	/* step 3: white area = enlarged page - character rectangles */
	white = rectanglelist_new(MAXRECT);
	if (white == NULL) {
		rectanglelist_free(layout);
		return NULL;
	}
	poppler_page_get_crop_box(page, white->rect);
	/* enlarge by more than the minimal size on every side, otherwise
	 * thin white areas at the borders are lost */
	white->rect[0].x1 -= wb.both + 1.0;
	white->rect[0].y1 -= wb.both + 1.0;
	white->rect[0].x2 += wb.both + 1.0;
	white->rect[0].y2 += wb.both + 1.0;
	white->num = 1;
	if (! rectanglelist_subtract(&white, layout, &wb)) {
		rectanglelist_free(white);
		rectanglelist_free(layout);
		return NULL;
	}
	if (debugrectangles)
		printf("white rectangles: %u\n", white->num);
	rectanglelist_free(layout);
	if (debugrectangles == 3)
		return white;

	/* step 4: used area = page - white area */
	black = rectanglelist_new(MAXRECT);
	if (black == NULL) {
		rectanglelist_free(white);
		return NULL;
	}
	poppler_page_get_crop_box(page, black->rect);
	black->num = 1;
	if (! rectanglelist_subtract(&black, white, &bb)) {
		rectanglelist_free(black);
		rectanglelist_free(white);
		return NULL;
	}
	if (debugrectangles)
		printf("black rectangles: %u\n", black->num);
	rectanglelist_free(white);
	if (debugrectangles == 4)
		return black;

	/* step 5: join the rectangles that touch or overlap */
	rectanglelist_join(black);
	if (debugrectangles)
		printf("joined rectangles: %u\n", black->num);

	rectanglelist_tighten(black);
	return black;
}
/*
 * text area in the page; w is the minimal distance considered a white space
 *
 * if the text area cannot be computed because of the large number of
 * rectangles, the result is a list containing only the crop box of the
 * whole page
 */
RectangleList *rectanglelist_textarea_distance(PopplerPage *page, gdouble w) {
	RectangleList *area;

	area = rectanglelist_textarea_bound(page, w, 100.0, 5.0, 10.0);
	if (area == NULL) {
		/* fallback: just the whole page */
		area = rectanglelist_new(1);
		poppler_page_get_crop_box(page, area->rect);
		area->num = 1;
	}
	return area;
}
/*
 * text area in the page, with the default minimal distance for a white
 * space
 */
RectangleList *rectanglelist_textarea(PopplerPage *page) {
	const gdouble default_distance = 15.0;

	return rectanglelist_textarea_distance(page, default_distance);
}
/*
 * overall bounding box: the smallest rectangle containing the rectangles
 * of all characters in the page
 *
 * returns a newly allocated rectangle to be released with
 * poppler_rectangle_free(), or NULL if the page contains no text
 */
PopplerRectangle *rectanglelist_boundingbox(PopplerPage *page) {
	PopplerRectangle *chars = NULL, *boundingbox;
	guint num = 0, i;

	/* without characters there is no first rectangle to start from */
	if (! poppler_page_get_text_layout(page, &chars, &num) || num == 0) {
		g_free(chars);
		return NULL;
	}

	boundingbox = poppler_rectangle_copy(chars);
	for (i = 1; i < num; i++)
		rectangle_join(boundingbox, chars + i);

	g_free(chars);
	return boundingbox;
}
/*
 * draw a rectangle on a cairo context with a random color
 *
 * if fill is true the rectangle is filled, otherwise only its border is
 * drawn (filling consumes the path, so the following stroke has nothing
 * left to draw)
 */
void rectangle_draw(cairo_t *cr, PopplerRectangle *rect, gboolean fill) {
	gdouble red, green, blue;

	/* rand() is the generator RAND_MAX refers to; components are kept
	 * below 0.8 to avoid too light colors; the three calls are separate
	 * statements because the order of evaluation of function arguments
	 * is unspecified */
	red = ((gdouble) rand()) / RAND_MAX * 0.8;
	green = ((gdouble) rand()) / RAND_MAX * 0.8;
	blue = ((gdouble) rand()) / RAND_MAX * 0.8;
	cairo_set_source_rgb(cr, red, green, blue);

	cairo_rectangle(cr,
		rect->x1, rect->y1,
		rect->x2 - rect->x1, rect->y2 - rect->y1);
	if (fill)
		cairo_fill(cr);
	cairo_stroke(cr);
}
/*
 * draw a rectangle list on a cairo context
 *
 * each rectangle is drawn by rectangle_draw(), filled if fill is true; if
 * num is true, the index of each rectangle in the list is written next to
 * its (x1,y1) corner
 */
void rectanglelist_draw(cairo_t *cr, RectangleList *rl,
		gboolean fill, gboolean num) {
	guint r;
	char buf[20];

	for (r = 0; r < rl->num; r++) {
		rectangle_draw(cr, rl->rect + r, fill);
		if (num) {
			cairo_move_to(cr,
				rl->rect[r].x1 - 10.0,
				rl->rect[r].y1 + 10.0);
			snprintf(buf, sizeof buf, "%u", r);
			cairo_show_text(cr, buf);
		}
	}
}
/*
* from file name to uri
*/
char *filenametouri(char *filename) {
char *dir, *sep, *uri;
if (filename[0] == '/') {
dir = "";
sep = "";
}
else {
dir = malloc(4096);
if (dir == NULL) {
printf("failed to allocate memory for directory\n");
return NULL;
}
if (getcwd(dir, 4096) == NULL) {
printf("error in obtaining the current directory\n");
return NULL;
}
sep = "/";
}
uri = malloc(strlen("file:") + strlen(dir) +
strlen(sep) + strlen(filename) + 1);
if (uri == NULL) {
printf("failed to allocate memory for file name\n");
return NULL;
}
strcpy(uri, "file:");
strcat(uri, dir);
strcat(uri, sep);
strcat(uri, filename);
return uri;
}
/*
* main
*/
int main(int argc, char *argv[]) {
int opt;
gdouble distance = 15.0;
gboolean numbers = FALSE;
char *filename;
PopplerDocument *doc;
PopplerPage *page;
int npages, n;
RectangleList *textarea;
PopplerRectangle *boundingbox;
cairo_surface_t *surface;
cairo_t *cr;
/* arguments */
while ((opt = getopt(argc, argv, "nd:r:")) != -1)
switch(opt) {
case 'n':
numbers = TRUE;
break;
case 'd':
distance = atof(optarg);
break;
case 'r':
debugrectangles = atoi(optarg);
break;
}
if (argc - 1 < optind) {
printf("pdfrects [-d distance] [-r level] [-n] file.pdf\n");
exit(EXIT_FAILURE);
}
filename = filenametouri(argv[optind]);
if (! filename)
exit(EXIT_FAILURE);
printf("%s -> result.pdf\n", argv[optind]);
/* open file */
doc = poppler_document_new_from_file(filename, NULL, NULL);
if (doc == NULL) {
printf("error opening pdf file\n");
exit(EXIT_FAILURE);
}
/* number of pages */
npages = poppler_document_get_n_pages(doc);
if (npages < 1) {
printf("no page in document\n");
exit(EXIT_FAILURE);
}
/* copy to destination */
surface = cairo_pdf_surface_create("result.pdf", 595.22, 842.00);
for (n = 0; n < npages; n++) {
printf("page %d\n", n);
page = poppler_document_get_page(doc, n);
// textarea = rectanglelist_textarea(page);
textarea = rectanglelist_textarea_distance(page, distance);
boundingbox = rectanglelist_boundingbox(page);
cr = cairo_create(surface);
poppler_page_render_for_printing(page, cr);
rectanglelist_draw(cr, textarea, FALSE, numbers);
// rectangle_draw(cr, boundingbox, FALSE);
cairo_destroy(cr);
cairo_surface_show_page(surface);
rectanglelist_free(textarea);
poppler_rectangle_free(boundingbox);
}
cairo_surface_destroy(surface);
return EXIT_SUCCESS;
}
Makefile
Description: Binary data
_______________________________________________ poppler mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/poppler
