# Run tesstrain.sh for --lang chi_sim using the training text
# ~/langdata/chi_sim/chi_sim_tuned.txt and the fonts listed after --fontlist;
# results are written to the --output_dir given on the last line.
# NOTE(review): the lstmtraining commands further down read from
# ~/tesstutorial/chi_sim_train/, not ~/tesstutorial/chi_sim_trainnew --
# confirm which directory is intended before running these as a sequence.
# NOTE(review): the quoted font names presumably have to stay together,
# directly after --fontlist and before the next "--" option -- verify against
# tesstrain.sh's argument parsing.
~/tesseract/src/training/tesstrain.sh \
--fonts_dir ~/.fonts \
--training_text ~/langdata/chi_sim/chi_sim_tuned.txt \
--langdata_dir ~/langdata \
--tessdata_dir ~/tessdata \
--lang chi_sim --linedata_only \
--noextract_font_properties  \
--exposures "0" \
--workspace_dir ~/tmp \
--save_box_tiff \
--fontlist  \
"NSimSun" \
"Arial Unicode MS" \
"SimSun" \
"Merchant Copy" \
"Merchant Copy Doublesize" \
"Noto Sans CJK SC" \
"Noto Sans Mono CJK SC" \
--output_dir ~/tesstutorial/chi_sim_trainnew


# Create the working directory for fine-tuning, then extract the LSTM model
# from the tessdata_best chi_sim.traineddata into it as chi_sim.lstm.
mkdir -p ~/tesstutorial/chi_sim_tuned_from_chi_sim

# The output path is an argument to `combine_tessdata -e`, so the command has
# to be continued with a backslash; without it the shell would run the
# extraction with no output file and then try to execute the .lstm path.
combine_tessdata -e ~/tessdata_best/chi_sim.traineddata \
~/tesstutorial/chi_sim_tuned_from_chi_sim/chi_sim.lstm

# Fine-tuning run: lstmtraining continues from the chi_sim.lstm extracted
# above (--continue_from), writes its output under the prefix
# ~/tesstutorial/chi_sim_tuned_from_chi_sim/chi_sim_tuned (--model_output),
# and is capped at 3600 iterations (--max_iterations).
# NOTE(review): --traineddata and --train_listfile point into
# ~/tesstutorial/chi_sim_train/, whereas the tesstrain.sh call above used
# --output_dir ~/tesstutorial/chi_sim_trainnew -- confirm the paths match.
~/tesseract/bin/src/training/lstmtraining \
--model_output ~/tesstutorial/chi_sim_tuned_from_chi_sim/chi_sim_tuned \
--continue_from ~/tesstutorial/chi_sim_tuned_from_chi_sim/chi_sim.lstm \
--traineddata ~/tesstutorial/chi_sim_train/chi_sim/chi_sim.traineddata \
--old_traineddata ~/tessdata_best/chi_sim.traineddata \
--train_listfile ~/tesstutorial/chi_sim_train/chi_sim.training_files.txt \
--debug_interval -1 \
--max_iterations 3600

# Final step: run lstmtraining with --stop_training on the checkpoint from
# the fine-tuning run and write the result to
# ~/tessdata_best/chi_sim_tuned.traineddata.
# --continue_from and its checkpoint path are kept on one continued line: the
# flag takes the path as its value, so a bare newline between them would end
# the command early and make the shell try to execute the checkpoint file.
# NOTE(review): --traineddata points into ~/tesstutorial/chi_sim_train/ --
# confirm this matches the directory tesstrain.sh actually wrote to.
~/tesseract/bin/src/training/lstmtraining \
--stop_training \
--continue_from ~/tesstutorial/chi_sim_tuned_from_chi_sim/chi_sim_tuned_checkpoint \
--traineddata ~/tesstutorial/chi_sim_train/chi_sim/chi_sim.traineddata \
--model_output ~/tessdata_best/chi_sim_tuned.traineddata


On Wed, Mar 20, 2019 at 8:46 AM Shree Devi Kumar <[email protected]>
wrote:

> Also, 10000 iterations for finetuning will lead to overfitting.
>
> I tried by using fewer fonts and adding a couple of English only fonts
> that match the typeface of the image you shared. The output is improved
> compared to tessdata_best. I assume that you want to limit your unicharset
> based on your training_text (numbers, some English letters and some
> Simplified Chinese characters). The image was pre-processed to B&W and
> deskewed.
>
> I found that --psm 6 gives worse results both for tessdata_best and
> finetuned, but the default psm gives better accuracy though there are
> multiple blank lines for extra columns identified in --psm 3.
>
> See attached:
>
>
>

-- 

____________________________________________________________
भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduUeONc98a%3DMiGE1Y1PGKK-Jb5vinDTPnEF%2BMvPUkT0nmw%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to