Hello,

Because the training procedure of ocropus has changed, I have written a simple 
script that takes tesseract 3.02 training files and converts them into ocropus 
0.7 training data. The script is not perfect and only works on single-page 
TIFFs.

If you have improvements, please do not hesitate to contact me.

I would also be happy if the script were added to the ocropus repository.

With best regards

Andreas

-- 
You received this message because you are subscribed to the Google Groups 
"ocropus" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msg/ocropus/-/MGXRmcj03g4J.
For more options, visit https://groups.google.com/groups/opt_out.


#!/bin/bash
# script to use tesseract 3.02 training files to generate ocropus 0.7 training files
# created 2013 by Andreas Romeyke ([email protected]) 
# should be used under terms of Gnu General Public License v3.0 or higher
# see http://www.gnu.org/licenses/gpl-3.0.html for LICENSE and further information
#
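# requires ImageMagick (the `convert` command) to be installed
# expects one single-page <name>.tif next to every <name>.box in the tesseract directory
# 
# call: tess2ocropus tess_train/ ocrotrain/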

#######################################
# Crop one text line out of the vertically flipped page image and store it,
# together with its ground-truth text, as 01XXXX.bin.png / 01XXXX.gt.txt
# (XXXX = hexadecimal line index).
# Arguments:
#   $1 - flipped page image
#   $2 - output directory
#   $3 - line index
#   $4 $5 $6 $7 - minx miny maxx maxy of the line (box-file coordinates)
#   $8 - dimy (only echoed in the progress message)
#   $9 - text of the line
# Outputs:   progress message on stdout
# Returns:   non-zero if the image could not be extracted or the text not
#            written
#######################################
_convert_page_write_line() {
	local flipped=$1 outdir=$2 lineno=$3
	local minx=$4 miny=$5 maxx=$6 maxy=$7 dimy=$8 word=$9
	local w=$((1 + maxx - minx))
	local h=$((1 + maxy - miny))
	local slineno
	printf -v slineno '01%04x' "$lineno"
	echo "extract new line: ${w}x$h+$minx+$miny ($minx $miny $maxx $maxy) dimy=$dimy word=$word"
	# Tesseract box files measure y from the bottom of the page. The page image
	# was flipped beforehand so that y can be used directly as a top-left crop
	# offset; the crop itself is flipped back to upright here.
	convert -extract "${w}x${h}+${minx}+${miny}" "$flipped" -flip \
		"$outdir/$slineno.bin.png" || return 1
	printf '%s\n' "$word" >"$outdir/$slineno.gt.txt"
}

#######################################
# Convert one tesseract box file (plus its page image) into per-line ocropus
# training data: one cropped image and one ground-truth text file per line.
# A new text line is assumed whenever a glyph starts left of its predecessor;
# a space is inserted when the horizontal distance between two glyphs of the
# same line exceeds a fixed gap.
# Globals:   tessdir (read)    - directory holding <name>.tif
#            ocropusdir (read) - directory that receives <name>/
# Arguments: $1 - path to the <name>.box file
# Outputs:   the box file name and one progress message per line on stdout,
#            diagnostics on stderr
# Returns:   0 on success, non-zero if the page or one of its lines could not
#            be converted
#######################################
convert_page() {
	local pagefname=$1
	local -r gap=8 # to recognize spaces between text
	local -r int_re='^[0-9]+$'
	local base srcimage outdir tmpdir flipped
	local c x0 y0 x1 y1 rest
	local lastx=0 lastx1=0
	local minx=0 miny=0 maxx=0 maxy=0
	local lineno=0 dimy=0 have_line=0 status=0
	local word=""

	echo "$pagefname"
	if [[ ! -r $pagefname ]]; then
		echo "cannot read box file '$pagefname'" >&2
		return 1
	fi

	base=${pagefname##*/}
	base=${base%.box}
	srcimage=$tessdir/$base.tif
	outdir=$ocropusdir/$base

	# largest y1 of the page; only reported in the progress messages
	while IFS=$' \t' read -r c x0 y0 x1 y1 rest || [[ -n $c ]]; do
		[[ $y1 =~ $int_re ]] || continue
		if ((10#$y1 > dimy)); then dimy=$((10#$y1)); fi
	done <"$pagefname"

	# Work on a private temporary directory so that no file of the caller's
	# current directory can be overwritten.
	if ! tmpdir=$(mktemp -d); then
		echo "cannot create temporary directory" >&2
		return 1
	fi
	flipped=$tmpdir/$base.png
	if ! convert -flip "$srcimage" "$flipped"; then
		echo "cannot flip page image '$srcimage'" >&2
		rm -rf -- "$tmpdir"
		return 1
	fi
	if ! mkdir -p -- "$outdir"; then
		echo "cannot create output directory '$outdir'" >&2
		rm -rf -- "$tmpdir"
		return 1
	fi

	while IFS=$' \t' read -r c x0 y0 x1 y1 rest || [[ -n $c ]]; do
		# skip blank or malformed lines instead of failing in the arithmetic
		[[ $x0 =~ $int_re && $y0 =~ $int_re && $x1 =~ $int_re && $y1 =~ $int_re ]] || continue
		# force base 10 so that a leading zero is not read as octal
		x0=$((10#$x0))
		y0=$((10#$y0))
		x1=$((10#$x1))
		y1=$((10#$y1))

		# a glyph left of its predecessor starts a new line: emit the old one
		if ((have_line && x0 < lastx)); then
			_convert_page_write_line "$flipped" "$outdir" "$lineno" \
				"$minx" "$miny" "$maxx" "$maxy" "$dimy" "$word" || status=1
			lineno=$((lineno + 1))
			have_line=0
			word=""
		fi

		if ((have_line)); then
			if ((x0 < minx)); then minx=$x0; fi
			if ((x1 > maxx)); then maxx=$x1; fi
			if ((y0 < miny)); then miny=$y0; fi
			if ((y1 > maxy)); then maxy=$y1; fi
		else
			minx=$x0
			miny=$y0
			maxx=$x1
			maxy=$y1
			have_line=1
		fi

		# never start a line with a space
		if [[ -n $word ]] && ((x0 - lastx1 > gap)); then word+=" "; fi
		word+=$c
		lastx=$x0
		lastx1=$x1
	done <"$pagefname"

	# the last line of the page has no successor that would trigger its output
	if ((have_line)); then
		_convert_page_write_line "$flipped" "$outdir" "$lineno" \
			"$minx" "$miny" "$maxx" "$maxy" "$dimy" "$word" || status=1
	fi

	rm -rf -- "$tmpdir"
	return "$status"
}

#######################################
# Entry point: convert every *.box page found in the tesseract training
# directory into ocropus training data below the ocropus directory.
# Globals:   tessdir, ocropusdir (written; convert_page reads both)
# Arguments: $1 - directory with the tesseract *.box / *.tif files
#            $2 - existing directory that receives the ocropus data
# Outputs:   diagnostics on stderr
# Returns:   0 if all pages were converted, 1 on a missing directory, when no
#            box file was found, or when a page failed
#######################################
main() {
	# deliberately not local: convert_page picks these up as globals
	tessdir=$1
	ocropusdir=$2
	local file
	local found=0 status=0

	if [[ ! -d $tessdir ]]; then
		echo "no tessdir given '$tessdir'" >&2
		return 1
	fi
	if [[ ! -d $ocropusdir ]]; then
		echo "no ocropusdir given '$ocropusdir'" >&2
		return 1
	fi

	for file in "$tessdir"/*.box; do
		# an unmatched glob stays literal; skip it instead of converting "*.box"
		[[ -e $file ]] || continue
		found=1
		# keep going with the remaining pages, but remember the failure
		convert_page "$file" || status=1
	done

	if ((!found)); then
		echo "no *.box files found in '$tessdir'" >&2
		return 1
	fi
	return "$status"
}

main "$@"

Reply via email to