Hi all,

I recently posted this to the tesseract-dev list, and thought it 
might be of interest to some people here too.

I'm preparing to release a bunch of ground truth text & scans, and 
the scans contain a mix of Latin and Ancient Greek, mostly in 
separate columns.  As I'm only interested in testing the Ancient 
Greek training for Tesseract, I wrote a small program that generates 
UZN files identifying the Ancient Greek zones. These can then be used 
when testing the training with Tesseract, so that the Latin is 
completely ignored.

It's quite basic, but may perhaps be useful to people here, either 
as is, or as an example of a complete program using the C-API for 
Tesseract.

Run it without arguments for details of how to use it. Example usage 
for the above use case would be:

  uznforlang myscan.png grc+lat grc 0.5 > myscan.uzn

Any comments would be very welcome.

Nick

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/20140605164046.GB5444%40manta.lan.
For more options, visit https://groups.google.com/d/optout.
/* Copyright 2014 Nick White
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Help text written out by main() when the program is not given exactly
 * four arguments. It is a plain string literal, so it can be passed
 * straight to fputs(). */
#define usage "uznforlang - runs tesseract outputing uzn regions for each block with\n" \
              "             a high enough ratio of characters of a given language\n" \
              "usage: uznforlang imgpath ocrlang matchlang minmatching\n"

/* Compile this like so:
 * g++ -o uznforlang uznforlang.c -I/usr/include/leptonica -I/usr/include/tesseract -llept -ltesseract -lpthread
 */

/* This does its own calculation of the language of a given region,
 * rather than using the results from OSD, for 2 reasons:
 * 1) It allows any language to be used, regardless of its coverage
 *    in osd.traineddata
 * 2) The matching ratio can be set arbitrarily
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <allheaders.h>
#include <capi.h>

/* One entry of a lookup table pairing an integer code with a
 * human-readable name. */
typedef struct {
	int code;          /* numeric key that lookups compare against */
	const char *text;  /* name printed for that key */
} Typename;

/* Mapping of TessPolyBlockType to UZN names.
 * main() scans this table linearly to find the name printed as the last
 * column of each UZN line; several block types deliberately share the
 * same UZN name. A block type that is not listed here is reported as
 * "Text" by main(). */
static const Typename typenames[] = {
	{ PT_UNKNOWN, "Text" },
	{ PT_FLOWING_TEXT, "Text" },
	{ PT_HEADING_TEXT, "Header/Footer" },
	{ PT_PULLOUT_TEXT, "Text" },
	{ PT_EQUATION, "Equation" },
	{ PT_INLINE_EQUATION, "Equation" },
	{ PT_TABLE, "Table" },
	{ PT_VERTICAL_TEXT, "Text" },
	{ PT_CAPTION_TEXT, "Caption" },
	{ PT_FLOWING_IMAGE, "Image" },
	{ PT_HEADING_IMAGE, "Image" },
	{ PT_PULLOUT_IMAGE, "Image" },
	{ PT_HORZ_LINE, "Line" },
	{ PT_VERT_LINE, "Line" },
	{ PT_NOISE, "Noise" },
};

/* Number of elements in the array X.
 * Only meaningful for true arrays: the count is derived from sizeof, so
 * passing a pointer (e.g. an array function parameter) gives a wrong
 * result. The argument is parenthesised so that any array-valued
 * expression can be passed safely. */
#define LENGTH(X) (sizeof (X) / sizeof (X)[0])

/* Report a fatal error and stop the program.
 * errstr is written to stderr exactly as given (no newline is appended),
 * then the process exits with EXIT_FAILURE. This function never returns. */
static void die(const char *errstr) {
	fprintf(stderr, "%s", errstr);
	exit(EXIT_FAILURE);
}

int main(int argc, char *argv[]) {
	TessBaseAPI *handle;
	TessResultIterator *ri;
	TessPageIterator *pi;
	PIX *img;
	TessPolyBlockType blocktype;
	const char *name = 0;
	unsigned int i;
	int left, top, right, bottom;
	unsigned int wordcount, matchcount;
	float minmatch;

	if(argc != 5) {
		fputs(usage, stdout);
		return 1;
	}

	sscanf(argv[4], "%f", &minmatch);

	handle = TessBaseAPICreate();
	if(TessBaseAPIInit3(handle, NULL, argv[2]) != 0) {
		die("Error initialising tesseract\n");
	}
	TessBaseAPISetPageSegMode(handle, PSM_AUTO);

	if((img = pixRead(argv[1])) == NULL) {
		die("Error reading image\n");
	}

	TessBaseAPISetImage2(handle, img);
	if(TessBaseAPIRecognize(handle, NULL) != 0) {
		die("Error in Tesseract recognition\n");
	}

	if((ri = TessBaseAPIGetIterator(handle)) == NULL) {
		die("Error getting Tesseract Result Iterator\n");
	}
	pi = TessResultIteratorGetPageIterator(ri);
	TessPageIteratorBegin(pi);

	do {
		wordcount++;
		if(strcmp(TessResultIteratorWordRecognitionLanguage(ri), argv[3]) == 0) {
			matchcount++;
		}
		
		if(TessPageIteratorIsAtFinalElement(pi, RIL_BLOCK, RIL_WORD)) {
			if((float)matchcount / (float)wordcount < minmatch) {
				wordcount = matchcount = 0;
				continue;
			}
			wordcount = matchcount = 0;

			if(!TessPageIteratorBoundingBox(pi, RIL_BLOCK, &left, &top, &right, &bottom)) {
				fputs("Warning: Failed to read bounding box\n", stderr);
				continue;
			}

			blocktype = TessPageIteratorBlockType(pi);
			for(i = 0; i < LENGTH(typenames); i++) {
				if(typenames[i].code == blocktype) {
					name = typenames[i].text;
					break;
				}
			}
			if(!name) {
				name = strdup("Text");
			}

			printf("%5d %5d %5d %5d %s\n", left, top, right-left, bottom-top, name);
		}
	} while(TessPageIteratorNext(pi, RIL_WORD));

	TessResultIteratorDelete(ri); /* This also deletes the page iterator pi */
	TessBaseAPIEnd(handle);
	TessBaseAPIDelete(handle);
	pixDestroy(&img);

	return EXIT_SUCCESS;
}

Reply via email to