Hi,

I wrote a couple of functions for finding the area used by text in a page: one
returns a list of PopplerRectangles that enclose all text as tightly as
possible. Another function returns the smallest rectangle covering all text,
like 'gs -sDEVICE=bbox' does in postscript.

The enclosed test program can be run as 'pdfrects file.pdf'; it produces a new
pdf file 'result.pdf' with the rectangles drawn in various colors.

These functions may be used, for example, to copy text from two-column
documents, to zoom to the area actually used in the page, or to create a new
document better suited for small-screen devices such as tablets, ebook readers
and mobile phones.

If you think such functions may be of general use I may make a patch to
incorporate them in poppler.

/*
 * pdfrects.c
 *
 * find the area used by text in a page of a pdf document
 *
 * RectangleList *rectanglelist_textarea(PopplerPage *);
 *	a list of rectangles that do not touch or overlap and cover all text in
 *	the page
 *
 * RectangleList *rectanglelist_textarea_distance(PopplerPage *, gdouble);
 *	the second argument is the minimal distance to consider a white space;
 *	lower values lead to finer coverings of the used area
 *
 * PopplerRectangle *rectanglelist_boundingbox(PopplerPage *);
 * 	the overall bounding box of the page: the smallest rectangle that covers
 * 	all text in the page
 *
 * arguments to this test program:
 *	-d distance	minimal size of a white space
 *	-r level	value of the debugrectangles variable (-1 to 5, see below)
 *	-n		draw also the number of each rectangle
 *	file.pdf	file to read; output is always result.pdf
 */

/*
 * the algorithm:
 *
 * 1. C = list containing a rectangle for each character in the page
 *    (obtained from poppler)
 *
 * 2. C = join consecutive rectangles of C if they touch or overlap
 *    (this step is only for efficiency)
 *
 * 3. W = list comprising only a rectangle as large as the whole page
 *    for each rectangle R in C:
 *       - subtract R from each rectangle in W
 *         (each subtraction may generate up to four rectangles)
 *    now W is the white area of the page
 *
 * 4. B = list comprising only a rectangle as large as the whole page
 *    for each rectangle R in W:
 *       - subtract R from each rectangle in B
 *    now B covers the used area of the page
 *
 * 5. for each pair of rectangles in B:
 *       - if they touch or overlap, join them
 *    repeat until nothing changes
 *
 * 6. return B
 *
 * variable debugrectangles is for the intermediate lists of rectangles:
 *   - if its value x is greater than zero, the algorithm is cut short after
 *     step x and the current list of rectangles is returned; this allows for
 *     visualizing the intermediate results
 *   - also, the number of rectangles after each step is printed
 *   - if its value is -1, the number of rectangles is printed every time it
 *     changes during a subtraction
 */

/*
 * todo:
 *	images
 *	sort the rectangles
 *	page size (from cmdline or from original file)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <poppler.h>
#include <cairo.h>
#include <cairo-pdf.h>

/*
 * debugging aid for the text-area algorithm (see above):
 *   0		no debugging output
 *   non-zero	the number of rectangles is printed after each step
 *   1 - 4	additionally, the list produced by that step is returned
 *		instead of the final one
 *   -1		additionally, the number of rectangles is printed every time
 *		it changes during a subtraction
 */
int debugrectangles = 0;

/*
 * bounds on minimal rectangle size, as checked by rectangle_bound(): a
 * rectangle satisfies the bound when both its dimensions exceed "both" and
 * at least one of them exceeds "one"
 */
typedef struct {
	gdouble both;	/* threshold that width and height must both exceed */
	gdouble one;	/* threshold that width or height must exceed */
} RectangleBound;

/*
 * tell whether a rectangle is large enough for a bound: its width and height
 * must both exceed b->both, and at least one of them must exceed b->one
 */
gboolean rectangle_bound(PopplerRectangle *r, RectangleBound *b) {
	gdouble width = r->x2 - r->x1;
	gdouble height = r->y2 - r->y1;

	/* too small in at least one direction */
	if (! (width > b->both && height > b->both))
		return FALSE;

	return width > b->one || height > b->one;
}

/*
 * tell whether rectangle b lies entirely inside rectangle a (borders may
 * coincide)
 */
gboolean rectangle_contains(PopplerRectangle *a, PopplerRectangle *b) {
	gboolean lowercorner = a->x1 <= b->x1 && a->y1 <= b->y1;
	gboolean uppercorner = a->x2 >= b->x2 && a->y2 >= b->y2;

	return lowercorner && uppercorner;
}

/*
 * tell whether two rectangles share at least a point: they overlap or their
 * borders touch
 */
gboolean rectangle_touch(PopplerRectangle *a, PopplerRectangle *b) {
	/* one is strictly aside the other */
	if (a->x2 < b->x1 || a->x1 > b->x2)
		return FALSE;

	/* one is strictly above the other */
	if (a->y2 < b->y1 || a->y1 > b->y2)
		return FALSE;

	return TRUE;
}

/*
 * copy rectangle orig onto rectangle dest
 *
 * dest and orig may be the same rectangle: this happens when the last
 * element of a list is copied onto itself; a plain structure assignment is
 * used because memcpy() is not defined for overlapping objects
 */
void rectangle_copy(PopplerRectangle *dest, PopplerRectangle *orig) {
	*dest = *orig;
}

/*
 * enlarge rectangle a to the smallest rectangle containing both a and b;
 * rectangle b is not changed
 */
void rectangle_join(PopplerRectangle *a, PopplerRectangle *b) {
	/* lower corner: the minimum of each coordinate */
	a->x1 = a->x1 < b->x1 ? a->x1 : b->x1;
	a->y1 = a->y1 < b->y1 ? a->y1 : b->y1;

	/* upper corner: the maximum of each coordinate */
	a->x2 = a->x2 > b->x2 ? a->x2 : b->x2;
	a->y2 = a->y2 > b->y2 ? a->y2 : b->y2;
}

/*
 * a list of rectangles, stored as an array of fixed capacity
 *
 * MAXRECT is the capacity used for the working lists of the algorithm; when
 * a list would grow over its capacity, rectanglelist_add() fails
 */
#define MAXRECT 50000
typedef struct {
	PopplerRectangle *rect;	/* the array of rectangles */
	guint num;		/* number of rectangles currently in the list */
	guint max;		/* number of rectangles the array can hold */
} RectangleList;

/*
 * allocate a rectangle list with maximum n rectangles and currently none
 *
 * a list of capacity zero (n <= 0) has no array: its rect field is NULL, so
 * that a caller storing its own array there does not lose an allocation
 *
 * returns NULL if memory cannot be allocated; the list is to be released
 * with rectanglelist_free()
 */
RectangleList *rectanglelist_new(int n) {
	RectangleList *res;

	if (n < 0)
		n = 0;

	res = malloc(sizeof *res);
	if (res == NULL)
		return NULL;

	res->num = 0;
	res->max = n;
	res->rect = NULL;

	if (n > 0) {
		res->rect = malloc(res->max * sizeof *res->rect);
		if (res->rect == NULL) {
			free(res);
			return NULL;
		}
	}

	return res;
}

/*
 * tighten a rectangle list by deallocating the unused entries
 *
 * an empty list is left with no array at all (rect is NULL), which avoids
 * calling realloc() with size zero; if shrinking fails, the list is left
 * unchanged, with its old capacity
 */
void rectanglelist_tighten(RectangleList *r) {
	PopplerRectangle *shrunk;

	if (r->num == 0) {
		free(r->rect);
		r->rect = NULL;
		r->max = 0;
		return;
	}

	/* do not overwrite r->rect directly: on failure realloc() returns
	 * NULL and the original array is still the valid one */
	shrunk = realloc(r->rect, r->num * sizeof *r->rect);
	if (shrunk == NULL)
		return;

	r->rect = shrunk;
	r->max = r->num;
}

/*
 * release a rectangle list: both its array of rectangles and the list
 * itself; nothing is done if the list is NULL
 */
void rectanglelist_free(RectangleList *rl) {
	if (rl != NULL) {
		free(rl->rect);
		free(rl);
	}
}

/*
 * remove the rectangle of index n from a list
 *
 * the order of the rectangles is not preserved: the last rectangle of the
 * list is moved to position n; an index out of the list is ignored
 */
void rectanglelist_delete(RectangleList *rl, guint n) {
	guint last;

	if (n >= rl->num)
		return;

	last = rl->num - 1;
	rl->num = last;
	rectangle_copy(rl->rect + n, rl->rect + last);
}

/*
 * add a rectangle to a list
 *
 * since a RectangleList represents the area that is the union of its
 * rectangles, when adding a new rectangle some simplifications can be done:
 *
 * - the request is ignored if the rectangle is contained in one in the list
 * - if the rectangle contains some rectangles in the list, it replaces them
 *
 * still, a rectangle list can be redundant: for example, a rectangle may be
 * contained in the union of other two
 *
 * returns FALSE only if the rectangle requires a new entry and the list is
 * already full; in this case the list is not changed; returns TRUE otherwise
 */
gboolean rectanglelist_add(RectangleList *rl, PopplerRectangle *rect) {
	guint r;
	gboolean placed;
	PopplerRectangle *cur;

	placed = FALSE;

	r = 0;
	while (r < rl->num) {
		cur = rl->rect + r;

		if (rectangle_contains(cur, rect))
			return TRUE;

		if (rectangle_contains(rect, cur)) {
			if (placed) {
				/* deleting moves the last rectangle to
				 * position r: do not advance, so that it is
				 * checked as well */
				rectanglelist_delete(rl, r);
				continue;
			}
			rectangle_copy(cur, rect);
			placed = TRUE;
		}

		r++;
	}

	if (placed)
		return TRUE;

	/* a new entry is needed only at this point */
	if (rl->num >= rl->max)
		return FALSE;

	rectangle_copy(rl->rect + rl->num++, rect);
	return TRUE;
}

/*
 * add a rectangle to a list only if it satisfies a bound; returns FALSE only
 * if the rectangle satisfies the bound but cannot be added to the list
 */
static gboolean rectanglelist_add_bounded(RectangleList *dest,
		PopplerRectangle *piece, RectangleBound *b) {
	if (! rectangle_bound(piece, b))
		return TRUE;
	return rectanglelist_add(dest, piece);
}

/*
 * append the subtraction of rectangle sub from list orig to list dest:
 *	dest += orig - sub
 *
 * for each rectangle in orig, subtract sub from it; this may generate up to
 * four rectangles, one for each side of sub; those satisfying the bound b
 * are appended to list dest
 *
 * each candidate is built in a local rectangle and not in the first unused
 * entry of dest, which does not exist when dest is full
 *
 * returns FALSE if list dest cannot hold all rectangles, TRUE otherwise
 */
gboolean rectanglelist_subtract_append(RectangleList *dest,
		RectangleList *orig, PopplerRectangle *sub,
		RectangleBound *b) {
	guint i;
	PopplerRectangle *a, piece;

	for (i = 0; i < orig->num; i++) {
		a = orig->rect + i;

		/* the part of a on the side of lower x of sub */
		piece = *a;
		piece.x2 = MIN(a->x2, sub->x1);
		if (! rectanglelist_add_bounded(dest, &piece, b))
			return FALSE;

		/* the part of a on the side of lower y of sub */
		piece = *a;
		piece.y2 = MIN(a->y2, sub->y1);
		if (! rectanglelist_add_bounded(dest, &piece, b))
			return FALSE;

		/* the part of a on the side of higher x of sub */
		piece = *a;
		piece.x1 = MAX(a->x1, sub->x2);
		if (! rectanglelist_add_bounded(dest, &piece, b))
			return FALSE;

		/* the part of a on the side of higher y of sub */
		piece = *a;
		piece.y1 = MAX(a->y1, sub->y2);
		if (! rectanglelist_add_bounded(dest, &piece, b))
			return FALSE;
	}

	return TRUE;
}

/*
 * subtract a rectangle list from another: orig -= sub
 *
 * the rectangles of sub are subtracted one at a time; each subtraction
 * builds a new list that replaces *orig, the previous one being freed
 *
 * returns FALSE if a new list cannot be allocated or cannot hold the result
 * of a subtraction; in this case *orig is still a valid list (the result of
 * the subtractions done so far) and has to be freed by the caller
 */
gboolean rectanglelist_subtract(RectangleList **orig, RectangleList *sub,
		RectangleBound *b) {
	RectangleList *dest;
	guint r;

	for (r = 0; r < sub->num; r++) {
		dest = rectanglelist_new(MAXRECT);
		if (dest == NULL)
			return FALSE;

		if (! rectanglelist_subtract_append(dest, *orig, sub->rect + r,
				b)) {
			/* the partial result is of no use */
			rectanglelist_free(dest);
			return FALSE;
		}

		if (debugrectangles == -1 && dest->num != (*orig)->num)
			printf("rectangles: %u\n", dest->num);

		rectanglelist_free(*orig);
		*orig = dest;
	}

	return TRUE;
}

/*
 * join consecutive touching rectangles of a list
 *
 * the list is compacted in place: every run of rectangles where each one
 * touches the join of the previous ones is replaced by that join
 */
void rectanglelist_consecutive(RectangleList *orig) {
	PopplerRectangle *run, *cur, *end;

	if (orig->num == 0)
		return;

	run = orig->rect;		/* the rectangle being enlarged */
	end = orig->rect + orig->num;

	for (cur = orig->rect + 1; cur < end; cur++) {
		if (rectangle_touch(run, cur)) {
			rectangle_join(run, cur);
			continue;
		}

		/* no contact: cur begins a new run */
		run++;
		rectangle_copy(run, cur);
	}

	orig->num = run - orig->rect + 1;
}

/*
 * join touching rectangles in a rectangle list, until no two rectangles in
 * the list touch or overlap
 */
void rectanglelist_join(RectangleList *orig) {
	guint first, second, before;

	/* the whole scan is repeated until the number of rectangles stops
	 * decreasing, because a join may produce a rectangle that overlaps
	 * one already passed, for example:
	 *	1 2
	 *	  3
	 *	654
	 * rectangle 1 touches no other; but 2 is then joined to 3, 4, 5 and
	 * 6, and the result includes 1, since the join of two rectangles is
	 * a rectangle that includes both */

	do {
		before = orig->num;

		for (first = 0; first < orig->num; first++) {
			second = first + 1;
			while (second < orig->num) {
				if (! rectangle_touch(orig->rect + first,
						orig->rect + second)) {
					second++;
					continue;
				}

				rectangle_join(orig->rect + first,
					orig->rect + second);
				rectanglelist_delete(orig, second);

				/* the first rectangle changed: compare it
				 * again with all the following ones */
				second = first + 1;
			}
		}
	} while (before != orig->num);
}

/*
 * the rectangles of the single characters in the page
 *
 * the array of rectangles is the one allocated by poppler; rectangles of
 * space characters are made zero-width
 *
 * returns NULL if the list cannot be allocated; a page with no text gives an
 * empty list
 */
RectangleList *rectanglelist_characters(PopplerPage *page) {
	RectangleList *layout;
	char *text, *cur;
	guint r;

	layout = rectanglelist_new(0);
	if (layout == NULL)
		return NULL;

	/* poppler stores its own array in layout->rect: release whatever
	 * placeholder the new list may have, so that it is not lost */
	free(layout->rect);
	layout->rect = NULL;
	if (! poppler_page_get_text_layout(page, &layout->rect, &layout->num))
		layout->num = 0;
	layout->max = layout->num;

	text = poppler_page_get_text(page);
	if (text == NULL)
		return layout;

	/* nullify rectangles of white spaces ' '; yes, it happens; stop at
	 * the end of the text even if more rectangles than characters were
	 * returned */
	cur = text;
	for (r = 0; r < layout->num && *cur != '\0'; r++) {
		if (*cur == ' ')
			layout->rect[r].x2 = layout->rect[r].x1;
		cur = g_utf8_next_char(cur);
	}

	g_free(text);
	return layout;
}

/*
 * the area used by text in the page
 *
 * the gdouble parameters define the minimal size of considered rectangles:
 * whiteboth and whiteone are the bound (see RectangleBound) for the
 * rectangles of white area, blackboth and blackone for those of used area
 *
 * returns a list to be released with rectanglelist_free(), or NULL if the
 * area cannot be computed because a list cannot be allocated or the
 * rectangles are too many; if debugrectangles is between 1 and 4, the list
 * produced by that step of the algorithm is returned instead
 */
RectangleList *rectanglelist_textarea_bound(PopplerPage *page,
		gdouble whiteboth, gdouble whiteone,
		gdouble blackboth, gdouble blackone) {
	RectangleList *layout, *white, *black;
	RectangleBound wb, bb;

	wb.both = whiteboth;
	wb.one = whiteone;
	bb.both = blackboth;
	bb.one = blackone;

				/* step 1: character rectangles */

	layout = rectanglelist_characters(page);
	if (layout == NULL)
		return NULL;
	if (debugrectangles)
		printf("character rectangles: %u\n", layout->num);
	if (debugrectangles == 1)
		return layout;

				/* step 2: join consecutive rectangles */

	rectanglelist_consecutive(layout);
	if (debugrectangles)
		printf("consecutive rectangles: %u\n", layout->num);
	if (debugrectangles == 2)
		return layout;

				/* step 3: white area */

	white = rectanglelist_new(MAXRECT);
	if (white == NULL) {
		rectanglelist_free(layout);
		return NULL;
	}
	poppler_page_get_crop_box(page, white->rect);
	/* enlarge, otherwise thin white areas at the borders are lost; the
	 * same margin is used on all sides, so that the page is actually
	 * enlarged whatever the value of wb.both */
	white->rect[0].x1 -= wb.both + 1.0;
	white->rect[0].y1 -= wb.both + 1.0;
	white->rect[0].x2 += wb.both + 1.0;
	white->rect[0].y2 += wb.both + 1.0;
	white->num = 1;

	if (! rectanglelist_subtract(&white, layout, &wb)) {
		rectanglelist_free(layout);
		rectanglelist_free(white);
		return NULL;
	}
	if (debugrectangles)
		printf("white rectangles: %u\n", white->num);
	rectanglelist_free(layout);
	if (debugrectangles == 3)
		return white;

				/* step 4: used area */

	black = rectanglelist_new(MAXRECT);
	if (black == NULL) {
		rectanglelist_free(white);
		return NULL;
	}
	poppler_page_get_crop_box(page, black->rect);
	black->num = 1;

	if (! rectanglelist_subtract(&black, white, &bb)) {
		rectanglelist_free(white);
		rectanglelist_free(black);
		return NULL;
	}
	if (debugrectangles)
		printf("black rectangles: %u\n", black->num);
	rectanglelist_free(white);
	if (debugrectangles == 4)
		return black;

				/* step 5: join touching rectangles */

	rectanglelist_join(black);
	if (debugrectangles)
		printf("joined rectangles: %u\n", black->num);

	rectanglelist_tighten(black);
	return black;
}

/*
 * text area in the page; w is the minimal distance considered a white space
 *
 * when the area cannot be determined, the result is a list containing only
 * the crop box of the page
 */
RectangleList *rectanglelist_textarea_distance(PopplerPage *page, gdouble w) {
	RectangleList *area;

	area = rectanglelist_textarea_bound(page, w, 100.0, 5.0, 10.0);
	if (area == NULL) {
		/* fallback: the rectangle list could not be found because
		 * of the large number of rectangles; the whole page is
		 * returned instead */
		area = rectanglelist_new(1);
		poppler_page_get_crop_box(page, area->rect);
		area->num = 1;
	}

	return area;
}

/*
 * text area in the page, with the default minimal distance considered a
 * white space
 */
RectangleList *rectanglelist_textarea(PopplerPage *page) {
	const gdouble defaultdistance = 15.0;

	return rectanglelist_textarea_distance(page, defaultdistance);
}

/*
 * overall bounding box: the smallest rectangle containing the rectangles of
 * all characters in the page
 *
 * returns a rectangle to be released with poppler_rectangle_free(), or NULL
 * if the page contains no text
 */
PopplerRectangle *rectanglelist_boundingbox(PopplerPage *page) {
	PopplerRectangle *chars, *boundingbox;
	guint num, i;

	chars = NULL;
	num = 0;
	if (! poppler_page_get_text_layout(page, &chars, &num) || num == 0) {
		/* no character: there is no first rectangle to start from */
		g_free(chars);
		return NULL;
	}

	boundingbox = poppler_rectangle_copy(chars);
	for (i = 1; i < num; i++)
		rectangle_join(boundingbox, chars + i);

	/* the array was allocated by poppler */
	g_free(chars);
	return boundingbox;
}

/*
 * draw a rectangle on a cairo context with a random color; the rectangle is
 * also filled if fill is true
 */
void rectangle_draw(cairo_t *cr, PopplerRectangle *rect, gboolean fill) {
	gdouble red, green, blue;
	gdouble width, height;

	/* each component is scaled by 0.8 */
	red = ((gdouble) random()) / RAND_MAX * 0.8;
	green = ((gdouble) random()) / RAND_MAX * 0.8;
	blue = ((gdouble) random()) / RAND_MAX * 0.8;
	cairo_set_source_rgb(cr, red, green, blue);

	width = rect->x2 - rect->x1;
	height = rect->y2 - rect->y1;
	cairo_rectangle(cr, rect->x1, rect->y1, width, height);

	if (fill)
		cairo_fill(cr);
	cairo_stroke(cr);
}

/*
 * draw a rectangle list on a cairo context
 *
 * each rectangle is drawn by rectangle_draw(), filled if fill is true; if
 * num is true, the index of each rectangle in the list is also written near
 * its corner (x1,y1)
 */
void rectanglelist_draw(cairo_t *cr, RectangleList *rl,
		gboolean fill, gboolean num) {
	guint r;
	char buf[20];

	for (r = 0; r < rl->num; r++) {
		rectangle_draw(cr, rl->rect + r, fill);
		if (! num)
			continue;

		cairo_move_to(cr,
			rl->rect[r].x1 - 10.0,
			rl->rect[r].y1 + 10.0);
		/* r is unsigned; the output is bounded by the buffer size */
		snprintf(buf, sizeof buf, "%u", r);
		cairo_show_text(cr, buf);
	}
}

/*
 * from file name to uri
 *
 * an absolute file name is only prefixed with "file:"; a relative one is
 * first made absolute by prepending the current directory; no character of
 * the file name is escaped
 *
 * returns a string to be released with free(), or NULL on error (a message
 * is printed in this case)
 */
char *filenametouri(char *filename) {
	enum { CWDSIZE = 4096 };
	char *cwd, *uri;
	const char *dir, *sep;
	size_t len;

	cwd = NULL;
	dir = "";
	sep = "";

	if (filename[0] != '/') {
		cwd = malloc(CWDSIZE);
		if (cwd == NULL) {
			printf("failed to allocate memory for directory\n");
			return NULL;
		}
		if (getcwd(cwd, CWDSIZE) == NULL) {
			printf("error in obtaining the current directory\n");
			free(cwd);
			return NULL;
		}
		dir = cwd;
		sep = "/";
	}

	len = strlen("file:") + strlen(dir) +
		strlen(sep) + strlen(filename) + 1;
	uri = malloc(len);
	if (uri == NULL) {
		printf("failed to allocate memory for file name\n");
		free(cwd);
		return NULL;
	}
	snprintf(uri, len, "file:%s%s%s", dir, sep, filename);

	/* the current directory is now part of the uri */
	free(cwd);
	return uri;
}

/*
 * main
 *
 * read the pdf file given as argument and write result.pdf, which contains
 * each page of the original with the rectangles of its text area drawn over
 * it; see the top of this file for the options
 */
int main(int argc, char *argv[]) {
	int opt;
	gdouble distance = 15.0;
	gboolean numbers = FALSE;
	char *filename;

	PopplerDocument *doc;
	PopplerPage *page;
	int npages, n;
	RectangleList *textarea;
	PopplerRectangle *boundingbox;

	cairo_surface_t *surface;
	cairo_t *cr;

				/* arguments */

	while ((opt = getopt(argc, argv, "nd:r:")) != -1)
		switch(opt) {
		case 'n':
			numbers = TRUE;
			break;
		case 'd':
			distance = atof(optarg);
			break;
		case 'r':
			debugrectangles = atoi(optarg);
			break;
		default:
			/* unknown option or missing option argument */
			printf("pdfrects [-d distance] [-r level] [-n] "
				"file.pdf\n");
			exit(EXIT_FAILURE);
		}

	if (argc - 1 < optind) {
		printf("pdfrects [-d distance] [-r level] [-n] file.pdf\n");
		exit(EXIT_FAILURE);
	}
	filename = filenametouri(argv[optind]);
	if (! filename)
		exit(EXIT_FAILURE);
	printf("%s -> result.pdf\n", argv[optind]);

				/* open file */

	doc = poppler_document_new_from_file(filename, NULL, NULL);
	free(filename);
	if (doc == NULL) {
		printf("error opening pdf file\n");
		exit(EXIT_FAILURE);
	}

				/* number of pages */

	npages = poppler_document_get_n_pages(doc);
	if (npages < 1) {
		printf("no page in document\n");
		g_object_unref(doc);
		exit(EXIT_FAILURE);
	}

				/* copy to destination */

	surface = cairo_pdf_surface_create("result.pdf", 595.22, 842.00);

	for (n = 0; n < npages; n++) {
		printf("page %d\n", n);
		page = poppler_document_get_page(doc, n);
		if (page == NULL) {
			printf("error reading page %d\n", n);
			continue;
		}
		textarea = rectanglelist_textarea_distance(page, distance);
		/* not drawn at the moment; may be NULL */
		boundingbox = rectanglelist_boundingbox(page);

		cr = cairo_create(surface);
		poppler_page_render_for_printing(page, cr);
		rectanglelist_draw(cr, textarea, FALSE, numbers);
		cairo_destroy(cr);
		cairo_surface_show_page(surface);

		rectanglelist_free(textarea);
		poppler_rectangle_free(boundingbox);
		/* each page obtained from the document is a new reference */
		g_object_unref(page);
	}

	cairo_surface_destroy(surface);
	g_object_unref(doc);

	return EXIT_SUCCESS;
}

Attachment: Makefile
Description: Binary data

_______________________________________________
poppler mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/poppler

Reply via email to