Hello, because the training of ocropus has been changed, I have developed a simple script that takes tesseract 3.02 training files and converts them into ocropus 0.7 training data. The script is not perfect and only works on single-page TIFFs.
If you have improvements, please do not hesitate to contact me. I would also be happy if the script were added to the ocropus repository. With best regards, Andreas -- You received this message because you are subscribed to the Google Groups "ocropus" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To post to this group, send email to [email protected]. To view this discussion on the web visit https://groups.google.com/d/msg/ocropus/-/MGXRmcj03g4J. For more options, visit https://groups.google.com/groups/opt_out.
#!/bin/bash
# script to use tesseract 3.02 training files to generate ocropus 0.7 training files
# created 2013 by Andreas Romeyke ([email protected])
# should be used under terms of Gnu General Public License v3.0 or higher
# see http://www.gnu.org/licenses/gpl-3.0.html for LICENSE and further information
#
# needs an installed imagemagick
# only works on single page tiffs
#
# call: tess2ocropus tess_train/ ocrotrain/
#
# For every <name>.box / <name>.tif pair in the tesseract directory a
# directory <ocropusdir>/<name>/ is created that holds one cropped image
# (01XXXX.bin.png) and one transcription (01XXXX.gt.txt) per text line,
# where XXXX is the zero-based line number in hexadecimal.

#######################################
# Crop one text line out of the flipped page image and store its
# transcription next to it.
# Arguments:
#   $1 - vertically flipped page image
#   $2 - output directory of the page
#   $3 - zero-based line number
#   $4 $5 $6 $7 - minx miny maxx maxy of the line (box file coordinates)
#   $8 - text of the line
# Outputs:  progress message on stdout
# Returns:  0 on success, 1 if the image could not be cropped
#######################################
_write_line() {
  local flipped=$1 outdir=$2 lineno=$3
  local minx=$4 miny=$5 maxx=$6 maxy=$7 text=$8
  local w h slineno

  w=$((1 + maxx - minx))
  h=$((1 + maxy - miny))
  printf -v slineno '01%04x' "$lineno"

  printf 'extract new line: %sx%s+%s+%s (%s %s %s %s) word=%s\n' \
    "$w" "$h" "$minx" "$miny" "$minx" "$miny" "$maxx" "$maxy" "$text"

  # -extract has to precede the input file; the crop is flipped back so the
  # resulting line image is upright again.
  convert -extract "${w}x${h}+${minx}+${miny}" "$flipped" -flip \
    "$outdir/$slineno.bin.png" || return 1
  printf '%s\n' "$text" >"$outdir/$slineno.gt.txt"
}

#######################################
# Split one tesseract box file (plus its tiff) into ocropus line images
# and ground-truth text files.
# Globals:
#   tessdir    (read) - directory containing <name>.tif
#   ocropusdir (read) - directory receiving <name>/01XXXX.{bin.png,gt.txt}
# Arguments:
#   $1 - path of the box file; each row is "<glyph> <x0> <y0> <x1> <y1> ..."
# Outputs:  box file name and one progress line per text line on stdout
# Returns:  0 on success, 1 if the page or one of its lines failed
#######################################
convert_page() {
  local pagefname=$1
  local -r gap=8 # to recognize spaces between text
  local -r num='^[0-9]+$'
  local base srcimage outdir tmpdir tmpimage
  local c x0 y0 x1 y1 _rest
  local lastx=0 lastx1=0
  local minx=0 miny=0 maxx=0 maxy=0
  local lineno=0 have_line=0 word="" status=0

  printf '%s\n' "$pagefname"

  if [[ ! -r "$pagefname" ]]; then
    printf 'cannot read box file %s\n' "$pagefname" >&2
    return 1
  fi

  base=${pagefname##*/}
  base=${base%.box}
  srcimage=$tessdir/$base.tif
  outdir=$ocropusdir/$base

  if [[ ! -e "$srcimage" ]]; then
    printf 'missing page image %s\n' "$srcimage" >&2
    return 1
  fi

  # Private scratch directory instead of a predictable file in the cwd.
  tmpdir=$(mktemp -d) || return 1
  tmpimage=$tmpdir/$base.png

  # The page is flipped vertically so that the box file's y values (measured
  # from the bottom edge of the page) can be used directly as crop offsets.
  if ! convert "$srcimage" -flip "$tmpimage" || ! mkdir -p -- "$outdir"; then
    rm -rf -- "$tmpdir"
    return 1
  fi

  while IFS=' ' read -r c x0 y0 x1 y1 _rest || [[ -n "$c" ]]; do
    # Skip blank or malformed rows instead of failing in the arithmetic below.
    [[ "$x0" =~ $num && "$y0" =~ $num && "$x1" =~ $num && "$y1" =~ $num ]] \
      || continue
    # Force base 10 so a zero-padded coordinate is not parsed as octal.
    x0=$((10#$x0)) y0=$((10#$y0)) x1=$((10#$x1)) y1=$((10#$y1))

    # A glyph that starts left of its predecessor begins a new text line:
    # emit the line collected so far.
    if ((have_line && x0 < lastx)); then
      _write_line "$tmpimage" "$outdir" "$lineno" \
        "$minx" "$miny" "$maxx" "$maxy" "$word" || status=1
      lineno=$((lineno + 1))
      have_line=0
      word=""
    fi

    if ((have_line)); then
      # Grow the bounding box of the current line.
      ((x0 < minx)) && minx=$x0
      ((x1 > maxx)) && maxx=$x1
      ((y0 < miny)) && miny=$y0
      ((y1 > maxy)) && maxy=$y1
      # A horizontal gap wider than $gap pixels separates two words. Only
      # checked inside a line, so no line starts with a spurious blank.
      if ((x0 - lastx1 > gap)); then
        word+=" "
      fi
    else
      minx=$x0 miny=$y0 maxx=$x1 maxy=$y1
      have_line=1
    fi

    word+=$c
    lastx=$x0
    lastx1=$x1
  done <"$pagefname"

  # The last text line has no successor that would trigger its extraction.
  if ((have_line)); then
    _write_line "$tmpimage" "$outdir" "$lineno" \
      "$minx" "$miny" "$maxx" "$maxy" "$word" || status=1
  fi

  rm -rf -- "$tmpdir"
  return "$status"
}

#######################################
# Convert every *.box file of the tesseract directory.
# Globals:   tessdir, ocropusdir (written; read by convert_page)
# Arguments: $1 - tesseract training directory, $2 - ocropus output directory
# Returns:   0 on success, 1 on a missing directory or a failed page
#######################################
main() {
  local file status=0

  tessdir=${1:-}
  ocropusdir=${2:-}

  if [[ ! -d "$tessdir" ]]; then
    echo "nop tessdir given '$tessdir'" >&2
    return 1
  fi
  if [[ ! -d "$ocropusdir" ]]; then
    echo "no ocropusdir given '$ocropusdir'" >&2
    return 1
  fi

  # here convert
  # for each *.box
  for file in "$tessdir"/*.box; do
    # An unmatched glob stays literal; do not treat it as a file.
    [[ -e "$file" ]] || continue
    convert_page "$file" || status=1
  done
  return "$status"
}

# Last command of the script, so its return value becomes the exit status.
main "$@"
