# Make about 150 lines of representative training text for finetuning finetune_training_text=$langdata_dir/$Lang/$Lang.finetune.training_text
# Make about 150 lines of representative training text for evaluation eval_training_text=$langdata_dir/$Lang/$Lang.eval.training_text On Thu, Jun 21, 2018 at 10:03 PM <[email protected]> wrote: > @Shree > > Thanks for providing the two bash scripts > I want to ask you about tesstrain.sh and tesstrain_utils.sh, Is there > something that must be edited before running lstmtrain_finetune_impact.sh ? > > On Wednesday, June 20, 2018 at 11:56:27 PM UTC+3, shree wrote: >> >> Here are the bash script files: >> >> 1. for finetune for impact training - add a font >> 2. for finetune plus-minus training - for adding a new character >> >> On Thu, Jun 21, 2018 at 1:40 AM Shree Devi Kumar <[email protected]> >> wrote: >> >>> Attached is a BASH script for Finetune training for 'Impact' (refer to >>> Ray's tutorial in wiki for more details). >>> Use this when you want to finetune a model for a single new font. >>> >>> You will need to change the paths for directories and filenames based on >>> your system. >>> >>> The script assumes that you have tesseract 4.0.0-beta installed >>> along with training tools. Refer to wiki main page for info on how to >>> download latest version of code from PPA etc. >>> >>> Please read through the script first, change as needed, create the >>> required training texts and then run the script. >>> >>> #!/bin/bash >>> ##################################################### >>> # Script to finetune a language traineddata file for one new font >>> # for tesseract4.0.0-beta >>> # Modify directory paths and filenames as required for your setup. >>> ##################################################### >>> # Choose which parts of script are to be run? 
>>> MakeData=yes >>> RunTraining=yes >>> RunEval=yes >>> ##################################################### >>> >>> # Language >>> Lang=eng >>> >>> # downloaded directory with language data >>> langdata_dir=~/langdata >>> >>> # Make about 150 lines of representative training text for finetuning >>> finetune_training_text=$langdata_dir/$Lang/$Lang.finetune.training_text >>> >>> # Make about 150 lines of representative training text for evaluation >>> eval_training_text=$langdata_dir/$Lang/$Lang.eval.training_text >>> >>> # fonts directory for this system >>> fonts_dir=~/.fonts >>> >>> # Finetune training for IMPACT - ONE font ONLY >>> fonts_for_training=" \ >>> 'Alanis Hand' \ >>> " >>> >>> # directory with the old 'best' language training set to continue from >>> eg. ara, eng, san >>> bestdata_dir=~/tessdata_best >>> >>> # tessdata-dir which has osd.trainddata, eng.traineddata, config and >>> tessconfigs folder and pdf.ttf >>> tessdata_dir=~/tessdata >>> >>> # directory with training scripts - tesstrain.sh etc. 
>>> tesstrain_dir=~/tesseract/src/training >>> >>> # output directories for this run >>> trained_output_dir=./$Lang-finetune-impact >>> eval_output_dir=./$Lang-finetune-impact-eval >>> >>> if [ $MakeData = "yes" ]; then >>> >>> echo "###### MAKING EVAL DATA ######" >>> rm -rf $eval_output_dir >>> mkdir $trained_output_dir >>> >>> echo "#### running tesstrain.sh for eval text ####" >>> >>> eval bash $tesstrain_dir/tesstrain.sh \ >>> --lang $Lang \ >>> --linedata_only \ >>> --noextract_font_properties \ >>> --exposures "0" \ >>> --fonts_dir $fonts_dir \ >>> --fontlist $fonts_for_training \ >>> --langdata_dir $langdata_dir \ >>> --tessdata_dir $tessdata_dir \ >>> --training_text $eval_training_text \ >>> --output_dir $eval_output_dir >>> >>> echo "###### MAKING TRAINING DATA ######" >>> rm -rf $trained_output_dir >>> mkdir $trained_output_dir >>> >>> echo "#### running tesstrain.sh for training text ####" >>> >>> eval bash $tesstrain_dir/tesstrain.sh \ >>> --lang $Lang \ >>> --linedata_only \ >>> --noextract_font_properties \ >>> --exposures "0" \ >>> --fonts_dir $fonts_dir \ >>> --fontlist $fonts_for_training \ >>> --langdata_dir $langdata_dir \ >>> --tessdata_dir $tessdata_dir \ >>> --training_text $finetune_training_text \ >>> --output_dir $trained_output_dir >>> >>> echo "#### running combine_tessdata to extract lstm model from >>> 'tessdata_best' for $Lang ####" >>> >>> combine_tessdata -e $bestdata_dir/$Lang.traineddata >>> $bestdata_dir/$Lang.lstm >>> >>> fi >>> >>> if [ $RunTraining = "yes" ]; then >>> >>> echo "###### LSTM TRAINING ######" >>> >>> echo "#### running lstmtraining for finetuning from >>> $bestdata_dir/$Lang.traineddata #####" >>> >>> lstmtraining \ >>> --continue_from $bestdata_dir/$Lang.lstm \ >>> --traineddata $bestdata_dir/$Lang.traineddata \ >>> --max_iterations 1000 \ >>> --debug_interval 0 \ >>> --train_listfile $trained_output_dir/$Lang.training_files.txt \ >>> --model_output $trained_output_dir/finetune >>> >>> echo "###### BUILD 
FINETUNED MODEL ######" >>> >>> echo "#### Building final trained file $Lang-finetune-$Lang.traineddata >>> ####" >>> >>> lstmtraining \ >>> --stop_training \ >>> --continue_from $trained_output_dir/finetune_checkpoint \ >>> --traineddata $bestdata_dir/$Lang.traineddata \ >>> --model_output "$trained_output_dir/$Lang-finetune-$Lang.traineddata" >>> >>> fi >>> >>> if [ $RunEval = "yes" ]; then >>> >>> echo "###### EVAL ORIGINAL MODEL ######" >>> >>> lstmeval \ >>> --model $bestdata_dir/$Lang.traineddata \ >>> --eval_listfile $eval_output_dir/$Lang.training_files.txt \ >>> --verbosity 0 >>> >>> echo "###### EVAL FINETUNED MODEL ######" >>> >>> lstmeval \ >>> --model $trained_output_dir/$Lang-finetune-$Lang.traineddata \ >>> --eval_listfile $eval_output_dir/$Lang.training_files.txt \ >>> --verbosity 0 >>> >>> fi >>> >>> >>> On Wed, Jun 20, 2018 at 9:14 PM Shree Devi Kumar <[email protected]> >>> wrote: >>> >>>> >>>> https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-3.03%E2%80%933.05 >>>> >>>> >>>> https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-%E2%80%93-tesstrain.sh >>>> >>>> I haven't trained with tesseract 3 for a while. I will post instructions >>>> for tesseract4 later. >>>> >>>> On Wed, Jun 20, 2018 at 9:05 PM Navaneetha Bitla <[email protected]> >>>> wrote: >>>> >>>>> can you help us by saying how to train with tesstrain.sh >>>>> >>>>> It will help all of us, we are thankful to you. >>>>> >>>>> On Wed, Jun 20, 2018 at 8:59 PM, Shree Devi Kumar <[email protected]> >>>>> wrote: >>>>> >>>>>> You will have better control on training if you use tesstrain.sh >>>>>> provided with tesseract. >>>>>> >>>>>> On Wed, Jun 20, 2018 at 8:52 PM Navaneetha Bitla <[email protected]> >>>>>> wrote: >>>>>> >>>>>>> http://www.1001fonts.com/handwritten-fonts.html. >>>>>>> >>>>>>> the above link has 1900+ fonts from that site i have downloaded the >>>>>>> ttf files of fonts and converted to tiff files online. 
>>>>>>> >>>>>>> then i have trained the tiff files(fonts) using serak trainer. >>>>>>> >>>>>>> >>>>>>> If you got the accuracy just forward the results so everyone can >>>>>>> know and will follow you. >>>>>>> >>>>>>> Thank you >>>>>>> >>>>>>> On Wed, Jun 20, 2018 at 3:13 PM, James Q <[email protected]> >>>>>>> wrote: >>>>>>> >>>>>>>> I'm going to be using tesseract 4 and using the tesstrain.sh >>>>>>>> script. If I come across things that improve accuracy though I will >>>>>>>> let you >>>>>>>> know. >>>>>>>> >>>>>>>> Where did you find 1300 handwriting fonts? >>>>>>>> >>>>>>>> On Tuesday, June 19, 2018 at 5:19:54 PM UTC+1, Navaneetha Bitla >>>>>>>> wrote: >>>>>>>>> >>>>>>>>> serak trainer using training tesseract 3.5. >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> On Tue, Jun 19, 2018 at 9:29 PM, James Q <[email protected]> >>>>>>>>> wrote: >>>>>>>>> >>>>>>>>>> Hi Navaneetha >>>>>>>>>> I am also looking to start training tesseract using handwritten >>>>>>>>>> fonts and am about to start setting up my training environment. Are >>>>>>>>>> you >>>>>>>>>> training tesseract 4 by following the guide at >>>>>>>>>> https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00 >>>>>>>>>> ? >>>>>>>>>> >>>>>>>>>> If so are you fine tuning the existing english model, retraining >>>>>>>>>> just the top layer(s) or training from scratch with your additional >>>>>>>>>> fonts? >>>>>>>>>> >>>>>>>>>> Thanks >>>>>>>>>> Jim >>>>>>>>>> >>>>>>>>>> On Tuesday, June 19, 2018 at 10:30:30 AM UTC+1, Navaneetha Bitla >>>>>>>>>> wrote: >>>>>>>>>>> >>>>>>>>>>> Hi, this is Navaneetha >>>>>>>>>>> >>>>>>>>>>> i'm working in hand written character recognition project. >>>>>>>>>>> >>>>>>>>>>> I have trained 1300 different hand written fonts of english and >>>>>>>>>>> moved the files into tessdata directory. 
>>>>>>>>>>> >>>>>>>>>>> tested tesseract using the below commands: >>>>>>>>>>> >>>>>>>>>>> $convert -density 300 input.png -depth 8 -strip -background >>>>>>>>>>> white -alpha off out.tiff >>>>>>>>>>> >>>>>>>>>>> $tesseract out.tiff eng >>>>>>>>>>> >>>>>>>>>>> The input.png is of Alanis Hand font and i have trained this >>>>>>>>>>> font but i'm not getting at least 40% accuracy. >>>>>>>>>>> >>>>>>>>>>> Can someone help me. >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Thanks in advance. >>>>>>>>>>> >>>>>>>>>> -- >>>>>>>>>> You received this message because you are subscribed to the >>>>>>>>>> Google Groups "tesseract-ocr" group. >>>>>>>>>> To unsubscribe from this group and stop receiving emails from it, >>>>>>>>>> send an email to [email protected]. >>>>>>>>>> To post to this group, send email to [email protected]. >>>>>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr >>>>>>>>>> . >>>>>>>>>> To view this discussion on the web visit >>>>>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/253906ac-fedf-4364-ad70-e745b8786c0d%40googlegroups.com >>>>>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/253906ac-fedf-4364-ad70-e745b8786c0d%40googlegroups.com?utm_medium=email&utm_source=footer> >>>>>>>>>> . >>>>>>>>>> >>>>>>>>>> For more options, visit https://groups.google.com/d/optout. >>>>>>>>>> >>>>>>>>> >>>>>>>>> -- >>>>>>>> You received this message because you are subscribed to the Google >>>>>>>> Groups "tesseract-ocr" group. >>>>>>>> To unsubscribe from this group and stop receiving emails from it, >>>>>>>> send an email to [email protected]. >>>>>>>> To post to this group, send email to [email protected]. >>>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr. 
>>>>>>>> To view this discussion on the web visit >>>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/29a1bc53-d127-407b-8611-0652821a0707%40googlegroups.com >>>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/29a1bc53-d127-407b-8611-0652821a0707%40googlegroups.com?utm_medium=email&utm_source=footer> >>>>>>>> . >>>>>>>> >>>>>>>> For more options, visit https://groups.google.com/d/optout. >>>>>>>> >>>>>>> >>>>>>> -- >>>>>>> You received this message because you are subscribed to the Google >>>>>>> Groups "tesseract-ocr" group. >>>>>>> To unsubscribe from this group and stop receiving emails from it, >>>>>>> send an email to [email protected]. >>>>>>> To post to this group, send email to [email protected]. >>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr. >>>>>>> To view this discussion on the web visit >>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/CABbi8QfEe2r%2BynHHEGfr8_b-x5KOf2yJ1xr%2Be7e1sDCKxqUFXA%40mail.gmail.com >>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/CABbi8QfEe2r%2BynHHEGfr8_b-x5KOf2yJ1xr%2Be7e1sDCKxqUFXA%40mail.gmail.com?utm_medium=email&utm_source=footer> >>>>>>> . >>>>>>> For more options, visit https://groups.google.com/d/optout. >>>>>>> >>>>>> >>>>>> >>>>>> -- >>>>>> >>>>>> ____________________________________________________________ >>>>>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com >>>>>> >>>>>> -- >>>>>> You received this message because you are subscribed to the Google >>>>>> Groups "tesseract-ocr" group. >>>>>> To unsubscribe from this group and stop receiving emails from it, >>>>>> send an email to [email protected]. >>>>>> To post to this group, send email to [email protected]. >>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr. 
>>>>>> To view this discussion on the web visit >>>>>> https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduU4w%2BjPakoNOdzq6QyS3nF9rAp9gHSPUkKddioZTXsgyw%40mail.gmail.com >>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduU4w%2BjPakoNOdzq6QyS3nF9rAp9gHSPUkKddioZTXsgyw%40mail.gmail.com?utm_medium=email&utm_source=footer> >>>>>> . >>>>>> >>>>>> For more options, visit https://groups.google.com/d/optout. >>>>>> >>>>> >>>>> -- >>>>> You received this message because you are subscribed to the Google >>>>> Groups "tesseract-ocr" group. >>>>> To unsubscribe from this group and stop receiving emails from it, send >>>>> an email to [email protected]. >>>>> To post to this group, send email to [email protected]. >>>>> Visit this group at https://groups.google.com/group/tesseract-ocr. >>>>> To view this discussion on the web visit >>>>> https://groups.google.com/d/msgid/tesseract-ocr/CABbi8Qdg6FhUbL9ZznVNikY-CS9PcYCoWWeM_7OJNuq7BLMgUA%40mail.gmail.com >>>>> <https://groups.google.com/d/msgid/tesseract-ocr/CABbi8Qdg6FhUbL9ZznVNikY-CS9PcYCoWWeM_7OJNuq7BLMgUA%40mail.gmail.com?utm_medium=email&utm_source=footer> >>>>> . >>>>> For more options, visit https://groups.google.com/d/optout. >>>>> >>>> >>>> >>>> -- >>>> >>>> ____________________________________________________________ >>>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com >>>> >>> >>> >>> -- >>> >>> ____________________________________________________________ >>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com >>> >> >> >> -- >> >> ____________________________________________________________ >> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com >> > -- > You received this message because you are subscribed to the Google Groups > "tesseract-ocr" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to [email protected]. > To post to this group, send email to [email protected]. > Visit this group at https://groups.google.com/group/tesseract-ocr. 
> To view this discussion on the web visit > https://groups.google.com/d/msgid/tesseract-ocr/d3680fbd-8bdc-435a-86f1-d78fdec87b56%40googlegroups.com > <https://groups.google.com/d/msgid/tesseract-ocr/d3680fbd-8bdc-435a-86f1-d78fdec87b56%40googlegroups.com?utm_medium=email&utm_source=footer> > . > For more options, visit https://groups.google.com/d/optout. > -- ____________________________________________________________ भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To post to this group, send email to [email protected]. Visit this group at https://groups.google.com/group/tesseract-ocr. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduU2fLert%2BT1WFkgo3sr%3DnmLSXuZRDQYV62gmXbJzZv2_A%40mail.gmail.com. For more options, visit https://groups.google.com/d/optout.

