# Fine-tuning corpus: prepare roughly 150 representative lines of text
# and store them at this path under the language's langdata folder.
finetune_training_text="${langdata_dir}/${Lang}/${Lang}.finetune.training_text"

# Evaluation corpus: prepare roughly 150 representative lines of text
# and store them at this path under the language's langdata folder.
eval_training_text="${langdata_dir}/${Lang}/${Lang}.eval.training_text"




On Thu, Jun 21, 2018 at 10:03 PM <[email protected]> wrote:

> @Shree
>
> Thanks for providing the two bash scripts
> I want to ask you about tesstrain.sh and tesstrain_utils.sh: is there
> anything that must be edited before running lstmtrain_finetune_impact.sh?
>
> On Wednesday, June 20, 2018 at 11:56:27 PM UTC+3, shree wrote:
>>
>> Here are the bash script files:
>>
>> 1. for finetune for impact training - add a font
>> 2. for finetune plus-minus training - for adding a new character
>>
>> On Thu, Jun 21, 2018 at 1:40 AM Shree Devi Kumar <[email protected]>
>> wrote:
>>
>>> Attached is a BASH script for Finetune training for 'Impact' (refer to
>>> Ray's tutorial in wiki for more details).
>>> Use this when you want to finetune a model for a single new font.
>>>
>>> You will need to change the paths for directories and filenames based on
>>> your system.
>>>
>>> The script assumes that you have tesseract 4.0.0-beta installed
>>> along with the training tools. Refer to the wiki main page for info on how
>>> to download the latest version of the code from the PPA etc.
>>>
>>> Please read through the script first, change as needed, create the
>>> required training texts and then run the script.
>>>
>>> #!/bin/bash
>>> #####################################################
>>> # Script to finetune a language traineddata file for one new font
>>> # for tesseract4.0.0-beta
>>> # Modify directory paths and filenames as required for your setup.
>>> #####################################################
>>> # Choose which parts of script are to be run?
>>> MakeData=yes
>>> RunTraining=yes
>>> RunEval=yes
>>> #####################################################
>>>
>>> # Language
>>> Lang=eng
>>>
>>> # downloaded directory with language data
>>> langdata_dir=~/langdata
>>>
>>> # Make about 150 lines of representative training text for finetuning
>>> finetune_training_text=$langdata_dir/$Lang/$Lang.finetune.training_text
>>>
>>> # Make about 150 lines of representative training text for evaluation
>>> eval_training_text=$langdata_dir/$Lang/$Lang.eval.training_text
>>>
>>> # fonts directory for this system
>>> fonts_dir=~/.fonts
>>>
>>> # Finetune training for IMPACT - ONE font ONLY
>>> fonts_for_training=" \
>>> 'Alanis Hand'  \
>>> "
>>>
>>> # directory with the old 'best' language training set to continue from,
>>> # e.g. ara, eng, san
>>> bestdata_dir=~/tessdata_best
>>>
>>> # tessdata-dir which has osd.traineddata, eng.traineddata, the config and
>>> # tessconfigs folders, and pdf.ttf
>>> tessdata_dir=~/tessdata
>>>
>>> # directory with training scripts - tesstrain.sh etc.
>>> tesstrain_dir=~/tesseract/src/training
>>>
>>> # output directories for this run
>>> trained_output_dir=./$Lang-finetune-impact
>>> eval_output_dir=./$Lang-finetune-impact-eval
>>>
>>> if [ $MakeData = "yes" ]; then
>>>
>>> echo "###### MAKING EVAL DATA ######"
>>>  rm -rf $eval_output_dir
>>>  mkdir $trained_output_dir
>>>
>>> echo "#### running tesstrain.sh for eval text ####"
>>>
>>> eval bash $tesstrain_dir/tesstrain.sh \
>>> --lang $Lang \
>>> --linedata_only \
>>> --noextract_font_properties \
>>> --exposures "0" \
>>> --fonts_dir $fonts_dir \
>>> --fontlist $fonts_for_training \
>>> --langdata_dir $langdata_dir \
>>> --tessdata_dir  $tessdata_dir \
>>> --training_text $eval_training_text \
>>> --output_dir $eval_output_dir
>>>
>>> echo "###### MAKING TRAINING DATA ######"
>>>  rm -rf $trained_output_dir
>>>  mkdir $trained_output_dir
>>>
>>> echo "#### running tesstrain.sh for training text ####"
>>>
>>> eval bash $tesstrain_dir/tesstrain.sh \
>>> --lang $Lang \
>>> --linedata_only \
>>> --noextract_font_properties \
>>> --exposures "0" \
>>> --fonts_dir $fonts_dir \
>>> --fontlist $fonts_for_training \
>>> --langdata_dir $langdata_dir \
>>> --tessdata_dir  $tessdata_dir \
>>> --training_text $finetune_training_text \
>>> --output_dir $trained_output_dir
>>>
>>> echo "#### running combine_tessdata to extract lstm model from
>>> 'tessdata_best' for $Lang ####"
>>>
>>> combine_tessdata -e $bestdata_dir/$Lang.traineddata
>>> $bestdata_dir/$Lang.lstm
>>>
>>> fi
>>>
>>> if [ $RunTraining = "yes" ]; then
>>>
>>> echo "###### LSTM TRAINING ######"
>>>
>>> echo "#### running lstmtraining for finetuning from
>>> $bestdata_dir/$Lang.traineddata #####"
>>>
>>> lstmtraining \
>>> --continue_from  $bestdata_dir/$Lang.lstm \
>>> --traineddata    $bestdata_dir/$Lang.traineddata \
>>> --max_iterations 1000 \
>>> --debug_interval 0 \
>>> --train_listfile $trained_output_dir/$Lang.training_files.txt \
>>> --model_output  $trained_output_dir/finetune
>>>
>>> echo "###### BUILD FINETUNED MODEL ######"
>>>
>>> echo "#### Building final trained file $Lang-finetune-$Lang.traineddata
>>> ####"
>>>
>>> lstmtraining \
>>> --stop_training \
>>> --continue_from $trained_output_dir/finetune_checkpoint \
>>> --traineddata    $bestdata_dir/$Lang.traineddata \
>>> --model_output "$trained_output_dir/$Lang-finetune-$Lang.traineddata"
>>>
>>> fi
>>>
>>> if [ $RunEval = "yes" ]; then
>>>
>>> echo "###### EVAL ORIGINAL MODEL ######"
>>>
>>> lstmeval \
>>> --model  $bestdata_dir/$Lang.traineddata \
>>> --eval_listfile $eval_output_dir/$Lang.training_files.txt \
>>> --verbosity 0
>>>
>>> echo "###### EVAL FINETUNED MODEL ######"
>>>
>>> lstmeval \
>>> --model  $trained_output_dir/$Lang-finetune-$Lang.traineddata \
>>> --eval_listfile $eval_output_dir/$Lang.training_files.txt \
>>> --verbosity 0
>>>
>>> fi
>>>
>>>
>>> On Wed, Jun 20, 2018 at 9:14 PM Shree Devi Kumar <[email protected]>
>>> wrote:
>>>
>>>>
>>>> https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-3.03%E2%80%933.05
>>>>
>>>>
>>>> https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-%E2%80%93-tesstrain.sh
>>>>
>>>> I haven't trained with tesseract 3 for a while. I will post instructions
>>>> for tesseract4 later.
>>>>
>>>> On Wed, Jun 20, 2018 at 9:05 PM Navaneetha Bitla <[email protected]>
>>>> wrote:
>>>>
>>>>> can you help us by saying how to train with tesstrain.sh
>>>>>
>>>>> It will help all of us, we are thankful to you.
>>>>>
>>>>> On Wed, Jun 20, 2018 at 8:59 PM, Shree Devi Kumar <[email protected]>
>>>>> wrote:
>>>>>
>>>>>> You will have better control on training if you use tesstrain.sh
>>>>>> provided with tesseract.
>>>>>>
>>>>>> On Wed, Jun 20, 2018 at 8:52 PM Navaneetha Bitla <[email protected]>
>>>>>> wrote:
>>>>>>
>>>>>>> http://www.1001fonts.com/handwritten-fonts.html.
>>>>>>>
>>>>>>> The above link has 1900+ fonts. From that site I downloaded the
>>>>>>> ttf files of the fonts and converted them to tiff files online.
>>>>>>>
>>>>>>> Then I trained the tiff files (fonts) using serak trainer.
>>>>>>>
>>>>>>>
>>>>>>> If you get good accuracy, please forward the results so everyone can
>>>>>>> know and follow you.
>>>>>>>
>>>>>>> Thank you
>>>>>>>
>>>>>>> On Wed, Jun 20, 2018 at 3:13 PM, James Q <[email protected]>
>>>>>>> wrote:
>>>>>>>
>>>>>>>> I'm going to be using tesseract 4 and using the tesstrain.sh
>>>>>>>> script. If I come across things that improve accuracy though I will 
>>>>>>>> let you
>>>>>>>> know.
>>>>>>>>
>>>>>>>> Where did you find 1300 handwriting fonts?
>>>>>>>>
>>>>>>>> On Tuesday, June 19, 2018 at 5:19:54 PM UTC+1, Navaneetha Bitla
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>> serak trainer using training tesseract 3.5.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Tue, Jun 19, 2018 at 9:29 PM, James Q <[email protected]>
>>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>>> Hi Navaneetha
>>>>>>>>>> I am also looking to start training tesseract using handwritten
>>>>>>>>>> fonts and am about to start setting up my training environment. Are 
>>>>>>>>>> you
>>>>>>>>>> training tesseract 4 by following the guide at
>>>>>>>>>> https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00
>>>>>>>>>> ?
>>>>>>>>>>
>>>>>>>>>> If so are you fine tuning the existing english model, retraining
>>>>>>>>>> just the top layer(s) or training from scratch with your additional 
>>>>>>>>>> fonts?
>>>>>>>>>>
>>>>>>>>>> Thanks
>>>>>>>>>> Jim
>>>>>>>>>>
>>>>>>>>>> On Tuesday, June 19, 2018 at 10:30:30 AM UTC+1, Navaneetha Bitla
>>>>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>> Hi, this is Navaneetha
>>>>>>>>>>>
>>>>>>>>>>> i'm working in hand written character recognition project.
>>>>>>>>>>>
>>>>>>>>>>> I have trained 1300 different hand written fonts of english and
>>>>>>>>>>> moved the files into tessdata directory.
>>>>>>>>>>>
>>>>>>>>>>> tested tesseract using the below commands:
>>>>>>>>>>>
>>>>>>>>>>> $convert -density 300 input.png -depth 8 -strip -background
>>>>>>>>>>> white -alpha off out.tiff
>>>>>>>>>>>
>>>>>>>>>>>  $tesseract out.tiff eng
>>>>>>>>>>>
>>>>>>>>>>> The input.png is in the Alanis Hand font, and I have trained this
>>>>>>>>>>> font, but I'm not getting even 40% accuracy.
>>>>>>>>>>>
>>>>>>>>>>> Can someone help me.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Thanks in advance.
>>>>>>>>>>>
>>>>>>>>>> --
>>>>>>>>>> You received this message because you are subscribed to the
>>>>>>>>>> Google Groups "tesseract-ocr" group.
>>>>>>>>>> To unsubscribe from this group and stop receiving emails from it,
>>>>>>>>>> send an email to [email protected].
>>>>>>>>>> To post to this group, send email to [email protected].
>>>>>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr
>>>>>>>>>> .
>>>>>>>>>> To view this discussion on the web visit
>>>>>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/253906ac-fedf-4364-ad70-e745b8786c0d%40googlegroups.com
>>>>>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/253906ac-fedf-4364-ad70-e745b8786c0d%40googlegroups.com?utm_medium=email&utm_source=footer>
>>>>>>>>>> .
>>>>>>>>>>
>>>>>>>>>> For more options, visit https://groups.google.com/d/optout.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> --
>>>>>>>> You received this message because you are subscribed to the Google
>>>>>>>> Groups "tesseract-ocr" group.
>>>>>>>> To unsubscribe from this group and stop receiving emails from it,
>>>>>>>> send an email to [email protected].
>>>>>>>> To post to this group, send email to [email protected].
>>>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr.
>>>>>>>> To view this discussion on the web visit
>>>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/29a1bc53-d127-407b-8611-0652821a0707%40googlegroups.com
>>>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/29a1bc53-d127-407b-8611-0652821a0707%40googlegroups.com?utm_medium=email&utm_source=footer>
>>>>>>>> .
>>>>>>>>
>>>>>>>> For more options, visit https://groups.google.com/d/optout.
>>>>>>>>
>>>>>>>
>>>>>>> --
>>>>>>> You received this message because you are subscribed to the Google
>>>>>>> Groups "tesseract-ocr" group.
>>>>>>> To unsubscribe from this group and stop receiving emails from it,
>>>>>>> send an email to [email protected].
>>>>>>> To post to this group, send email to [email protected].
>>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr.
>>>>>>> To view this discussion on the web visit
>>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/CABbi8QfEe2r%2BynHHEGfr8_b-x5KOf2yJ1xr%2Be7e1sDCKxqUFXA%40mail.gmail.com
>>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/CABbi8QfEe2r%2BynHHEGfr8_b-x5KOf2yJ1xr%2Be7e1sDCKxqUFXA%40mail.gmail.com?utm_medium=email&utm_source=footer>
>>>>>>> .
>>>>>>> For more options, visit https://groups.google.com/d/optout.
>>>>>>>
>>>>>>
>>>>>>
>>>>>> --
>>>>>>
>>>>>> ____________________________________________________________
>>>>>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com
>>>>>>
>>>>>> --
>>>>>> You received this message because you are subscribed to the Google
>>>>>> Groups "tesseract-ocr" group.
>>>>>> To unsubscribe from this group and stop receiving emails from it,
>>>>>> send an email to [email protected].
>>>>>> To post to this group, send email to [email protected].
>>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr.
>>>>>> To view this discussion on the web visit
>>>>>> https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduU4w%2BjPakoNOdzq6QyS3nF9rAp9gHSPUkKddioZTXsgyw%40mail.gmail.com
>>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduU4w%2BjPakoNOdzq6QyS3nF9rAp9gHSPUkKddioZTXsgyw%40mail.gmail.com?utm_medium=email&utm_source=footer>
>>>>>> .
>>>>>>
>>>>>> For more options, visit https://groups.google.com/d/optout.
>>>>>>
>>>>>
>>>>> --
>>>>> You received this message because you are subscribed to the Google
>>>>> Groups "tesseract-ocr" group.
>>>>> To unsubscribe from this group and stop receiving emails from it, send
>>>>> an email to [email protected].
>>>>> To post to this group, send email to [email protected].
>>>>> Visit this group at https://groups.google.com/group/tesseract-ocr.
>>>>> To view this discussion on the web visit
>>>>> https://groups.google.com/d/msgid/tesseract-ocr/CABbi8Qdg6FhUbL9ZznVNikY-CS9PcYCoWWeM_7OJNuq7BLMgUA%40mail.gmail.com
>>>>> <https://groups.google.com/d/msgid/tesseract-ocr/CABbi8Qdg6FhUbL9ZznVNikY-CS9PcYCoWWeM_7OJNuq7BLMgUA%40mail.gmail.com?utm_medium=email&utm_source=footer>
>>>>> .
>>>>> For more options, visit https://groups.google.com/d/optout.
>>>>>
>>>>
>>>>
>>>> --
>>>>
>>>> ____________________________________________________________
>>>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com
>>>>
>>>
>>>
>>> --
>>>
>>> ____________________________________________________________
>>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com
>>>
>>
>>
>> --
>>
>> ____________________________________________________________
>> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com
>>
> --
> You received this message because you are subscribed to the Google Groups
> "tesseract-ocr" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> To post to this group, send email to [email protected].
> Visit this group at https://groups.google.com/group/tesseract-ocr.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/tesseract-ocr/d3680fbd-8bdc-435a-86f1-d78fdec87b56%40googlegroups.com
> <https://groups.google.com/d/msgid/tesseract-ocr/d3680fbd-8bdc-435a-86f1-d78fdec87b56%40googlegroups.com?utm_medium=email&utm_source=footer>
> .
> For more options, visit https://groups.google.com/d/optout.
>


-- 

____________________________________________________________
भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduU2fLert%2BT1WFkgo3sr%3DnmLSXuZRDQYV62gmXbJzZv2_A%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to