Hi,
    I am using the following Makefile to fine-tune eng.traineddata from
tessdata_best on my data.
        
# Export all variables to recipe shells and any sub-make.
export

SHELL := /bin/bash

# Root everything at the Makefile's own directory. $(CURDIR) is set by make
# itself and is reliable even under `make -C`; $(PWD) is inherited from the
# invoking shell and may be stale. NOTE(review): this deliberately overrides
# the user's $HOME for recipe shells — confirm that is intended.
HOME := $(CURDIR)

# Simply-expanded (:=) so the paths are fixed once at parse time.
TESSDATA := $(HOME)/tessdata
LANGDATA := $(HOME)/langdata

# Train directory
TRAIN := $(HOME)/train_data

# Name of the model to be built
MODEL_NAME := eng

# Name of the model to continue from
CONTINUE_FROM := eng

# Which tessdata repo to pull models from (_best holds the float models
# required for fine-tuning).
TESSDATA_REPO := _best

# Normalization Mode - see src/training/language_specific.sh for details
NORM_MODE := 2

# BEGIN-EVAL makefile-parser --make-help Makefile

# Print a summary of targets and tunable variables.
# (Recipe lines must start with a hard tab; the long echo lines are kept on
# single lines so the strings are not broken mid-quote.)
.PHONY: help
help:
	@echo ""
	@echo "  Targets"
	@echo ""
	@echo "    unicharset       Create unicharset"
	@echo "    lists            Create lists of lstmf filenames for training and eval"
	@echo "    training         Start training"
	@echo "    proto-model      Build the proto model"
	@echo "    leptonica        Build leptonica"
	@echo "    tesseract        Build tesseract"
	@echo "    tesseract-langs  Download tesseract-langs"
	@echo "    langdata         Download langdata"
	@echo "    clean            Clean all generated files"
	@echo ""
	@echo "  Variables"
	@echo ""
	@echo "    MODEL_NAME         Name of the model to be built"
	@echo "    CORES              No of cores to use for compiling leptonica/tesseract"
	@echo "    LEPTONICA_VERSION  Leptonica version. Default: $(LEPTONICA_VERSION)"
	@echo "    TESSERACT_VERSION  Tesseract commit. Default: $(TESSERACT_VERSION)"
	@echo "    LANGDATA_VERSION   Tesseract langdata version. Default: $(LANGDATA_VERSION)"
	@echo "    TESSDATA_REPO      Tesseract model repo to use. Default: $(TESSDATA_REPO)"
	@echo "    TRAIN              Train directory"
	@echo "    RATIO_TRAIN        Ratio of train / eval training data"

# END-EVAL

# Ratio of train / eval training data
RATIO_TRAIN := 0.90

# Aggregated intermediates: all ground-truth boxes concatenated, and the
# shuffled list of .lstmf training files.
ALL_BOXES := data/all-boxes
ALL_LSTMF := data/all-lstmf

# Command-style entry points — not files, so declare them phony to keep them
# working even if files with these names ever appear.
.PHONY: unicharset lists train-lists

# Create unicharset
unicharset: data/unicharset

# Create lists of lstmf filenames for training and eval
lists: $(ALL_LSTMF)

train-lists: data/list.train data/list.eval

# First RATIO_TRAIN fraction of the shuffled lstmf list goes to training.
# Note the ';' after the first assignment: the original chained the two
# assignments with only a line continuation, which worked by accident as a
# prefix assignment.
data/list.train: $(ALL_LSTMF)
	total=$$(wc -l < $(ALL_LSTMF)); \
	no=$$(echo "$$total * $(RATIO_TRAIN) / 1" | bc); \
	head -n "$$no" $(ALL_LSTMF) > "$@"

# Remaining (1 - RATIO_TRAIN) fraction of the shuffled lstmf list goes to
# evaluation. bc's integer division ("/ 1") truncates the float ratio.
data/list.eval: $(ALL_LSTMF)
	total=$$(wc -l < $(ALL_LSTMF)); \
	no=$$(echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc); \
	tail -n "$$no" $(ALL_LSTMF) > "$@"

# Start training (builds the final fine-tuned model).
.PHONY: training
training: data/$(MODEL_NAME).traineddata

# Extract the unicharset from the ground-truth boxes, and unpack the model we
# continue from. combine_tessdata -u writes $(CONTINUE_FROM).lstm (and the
# other components) into $(TESSDATA), which the training rule later consumes
# via --continue_from.
# Fixes vs. original: $(START_MODEL) was never defined (mkdir of bare
# "data/"), and unicharset_extractor wrote to $(TRAIN)/my.unicharset instead
# of the target, so data/unicharset was never actually created.
data/unicharset: $(ALL_BOXES)
	mkdir -p data/$(CONTINUE_FROM)
	combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata $(TESSDATA)/$(CONTINUE_FROM).
	unicharset_extractor --output_unicharset "$@" --norm_mode $(NORM_MODE) "$(ALL_BOXES)"
# Concatenate every ground-truth box file into one file for unicharset
# extraction. Ensure the data/ output directory exists; batch cat with
# '-exec ... +' instead of spawning one cat per file.
$(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
	@mkdir -p $(@D)
	find $(TRAIN) -name '*.box' -exec cat {} + > "$@"
    
# Generate a box file for each line image from its ground-truth text.
# $< is the first prerequisite (the .tif).
$(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%.gt.txt
	python generate_line_box.py -i "$<" -t "$(TRAIN)/$*.gt.txt" > "$@"

# Collect and randomly shuffle (sort -R) the lstmf paths so the head/tail
# train/eval split is a random partition. Ensure data/ exists first.
$(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
	@mkdir -p $(@D)
	find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"

# Create the .lstmf training file for each line image. The recipe reads the
# .tif, so it is declared as a prerequisite (it was a hidden dependency:
# a regenerated image would not have triggered a rebuild).
$(TRAIN)/%.lstmf: $(TRAIN)/%.tif $(TRAIN)/%.box
	tesseract $< $(TRAIN)/$* --dpi 300 --psm 7 lstm.train
    

# Build the proto model
.PHONY: proto-model
proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata

# Assemble the starter (proto) traineddata from the extracted unicharset and
# the langdata word/number/punctuation lists. $(LANGDATA) is order-only (|):
# it is a directory, and a directory's mtime changes whenever its contents
# change, which would otherwise cause spurious rebuilds.
data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: data/unicharset | $(LANGDATA)
	combine_lang_model \
	  --input_unicharset data/unicharset \
	  --script_dir $(LANGDATA) \
	  --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \
	  --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \
	  --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \
	  --output_dir data/ \
	  --lang $(MODEL_NAME)

# Fine-tune from the CONTINUE_FROM model. lstmtraining reads data/list.train
# and data/list.eval, so they must be prerequisites — they were missing,
# which made `make training` fail unless `make train-lists` had been run by
# hand first. The .lstm fed to --continue_from is unpacked by the
# data/unicharset rule's combine_tessdata -u step.
data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset proto-model data/list.train data/list.eval
	mkdir -p data/checkpoints
	lstmtraining \
	  --continue_from   $(TESSDATA)/$(CONTINUE_FROM).lstm \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
	  --model_output data/checkpoints/$(MODEL_NAME) \
	  --debug_interval -1 \
	  --train_listfile data/list.train \
	  --eval_listfile data/list.eval \
	  --sequential_training \
	  --max_iterations 4000

# Freeze the final checkpoint into a deployable .traineddata model.
# $< is the single prerequisite: the training checkpoint.
data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint
	lstmtraining \
	  --stop_training \
	  --continue_from $< \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
	  --model_output $@

# Clean all generated files. The generated .box/.lstmf files live in
# $(TRAIN) (the pattern rules write them next to the .tif images) — the
# original deleted from "data/train", which is not where they are created.
.PHONY: clean
clean:
	find $(TRAIN) -name '*.box' -delete
	find $(TRAIN) -name '*.lstmf' -delete
	rm -rf data/all-*
	rm -rf data/list.*
	rm -rf data/$(MODEL_NAME)
	rm -rf data/unicharset
	rm -rf data/checkpoints

Now the unicharset for my dataset is quite limited and it doesn't contain
all the characters present in latin.unicharset. I want to train Tesseract
on my dataset so that it can recognize characters better on the kind of
images that I will be feeding to it, but I don't want it to overfit
on the limited set of characters in my dataset. Is it possible to do
that?

For example: all my training images contain the letter 'E'. My dev and test set
images contain the letters 'F' and 'G'. Is it possible to train on 'E'
without losing accuracy on 'F' and 'G'? Basically this is a problem of
incremental learning.

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/e8b832df-ff7b-401d-bc3a-bc64c4cf1fcd%40googlegroups.com.

Reply via email to