Hi,
I am using the following Makefile to fine-tune eng.traineddata from
tessdata_best on my data.
# Export all variables below into the environment of recipe shells.
export
SHELL := /bin/bash

# Project root. NOTE(review): redefining (and exporting) HOME changes $HOME
# for every tool invoked from recipes — confirm this is intentional.
# $(CURDIR) is maintained by make itself and is more reliable than $(PWD),
# which is just an inherited shell variable.
HOME := $(CURDIR)
TESSDATA := $(HOME)/tessdata
LANGDATA := $(HOME)/langdata

# Train directory (tif / gt.txt line images live here)
TRAIN := $(HOME)/train_data

# Name of the model to be built
MODEL_NAME := eng

# Name of the model to continue from
CONTINUE_FROM := eng

# Suffix selecting which tessdata repo the base model comes from (tessdata_best)
TESSDATA_REPO := _best

# Normalization Mode - see src/training/language_specific.sh for details
NORM_MODE := 2
# BEGIN-EVAL makefile-parser --make-help Makefile

# Print an overview of targets and tunable variables.
.PHONY: help
help:
	@echo ""
	@echo "  Targets"
	@echo ""
	@echo "    unicharset       Create unicharset"
	@echo "    lists            Create lists of lstmf filenames for training and eval"
	@echo "    training         Start training"
	@echo "    proto-model      Build the proto model"
	@echo "    leptonica        Build leptonica"
	@echo "    tesseract        Build tesseract"
	@echo "    tesseract-langs  Download tesseract-langs"
	@echo "    langdata         Download langdata"
	@echo "    clean            Clean all generated files"
	@echo ""
	@echo "  Variables"
	@echo ""
	@echo "    MODEL_NAME         Name of the model to be built"
	@echo "    CORES              No of cores to use for compiling leptonica/tesseract"
	@echo "    LEPTONICA_VERSION  Leptonica version. Default: $(LEPTONICA_VERSION)"
	@echo "    TESSERACT_VERSION  Tesseract commit. Default: $(TESSERACT_VERSION)"
	@echo "    LANGDATA_VERSION   Tesseract langdata version. Default: $(LANGDATA_VERSION)"
	@echo "    TESSDATA_REPO      Tesseract model repo to use. Default: $(TESSDATA_REPO)"
	@echo "    TRAIN              Train directory"
	@echo "    RATIO_TRAIN        Ratio of train / eval training data"

# END-EVAL
# Ratio of train / eval training data
RATIO_TRAIN := 0.90

# Intermediate artifacts: concatenated box files and the shuffled lstmf list.
ALL_BOXES := data/all-boxes
ALL_LSTMF := data/all-lstmf
# Create unicharset
.PHONY: unicharset
unicharset: data/unicharset

# Create lists of lstmf filenames for training and eval.
# NOTE(review): "lists" only builds the combined lstmf list; the actual
# train/eval split is produced by the separate "train-lists" target.
.PHONY: lists train-lists
lists: $(ALL_LSTMF)

train-lists: data/list.train data/list.eval
# Take the first RATIO_TRAIN fraction of the (shuffled) lstmf list.
# bc's "/ 1" truncates the product to an integer line count.
data/list.train: $(ALL_LSTMF)
	total=$$(wc -l < $(ALL_LSTMF)); \
	  no=$$(echo "$$total * $(RATIO_TRAIN) / 1" | bc); \
	  head -n "$$no" $(ALL_LSTMF) > "$@"
# The remaining (1 - RATIO_TRAIN) fraction becomes the eval list.
data/list.eval: $(ALL_LSTMF)
	total=$$(wc -l < $(ALL_LSTMF)); \
	  no=$$(echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc); \
	  tail -n "$$no" $(ALL_LSTMF) > "$@"
# Start training (convenience alias for the final traineddata file).
.PHONY: training
training: data/$(MODEL_NAME).traineddata
# Build the merged unicharset: characters extracted from our box files,
# merged with the unicharset of the model we continue from, so characters
# absent from the training data are not dropped from the new model.
# Fixes from review: $(START_MODEL) was undefined (mkdir was a no-op);
# the merge_unicharsets step was commented out, so this rule never
# actually created its target "$@".
data/unicharset: $(ALL_BOXES)
	@mkdir -p data
	# Unpack the base traineddata; yields $(CONTINUE_FROM).lstm-unicharset,
	# $(CONTINUE_FROM).lstm, etc. next to the traineddata file.
	combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata $(TESSDATA)/$(CONTINUE_FROM).
	unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" --norm_mode $(NORM_MODE) "$(ALL_BOXES)"
	merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset $(TRAIN)/my.unicharset "$@"
# Concatenate every box file under $(TRAIN) into one file.
$(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
	@mkdir -p $(@D)
	find $(TRAIN) -name '*.box' -exec cat {} \; > "$@"
# Generate a box file for one line image from the tif and its ground truth.
$(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%.gt.txt
	python generate_line_box.py -i "$<" -t "$(word 2,$^)" > "$@"
# Collect every lstmf file into one randomly shuffled list
# (sort -R is a GNU coreutils extension).
$(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
	@mkdir -p $(@D)
	find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"
# Produce an lstmf training sample. tesseract reads both the tif and the
# box file, so the tif is declared too — otherwise a changed image would
# not trigger a rebuild (it was a hidden dependency).
$(TRAIN)/%.lstmf: $(TRAIN)/%.tif $(TRAIN)/%.box
	tesseract $(TRAIN)/$*.tif $(TRAIN)/$* --dpi 300 --psm 7 lstm.train
# Build the proto model
.PHONY: proto-model
proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata

# $(LANGDATA) is an order-only prerequisite: the directory must exist, but
# its mtime (which changes whenever any file inside changes) must not force
# a rebuild by itself.
data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: data/unicharset | $(LANGDATA)
	combine_lang_model \
	  --input_unicharset data/unicharset \
	  --script_dir $(LANGDATA) \
	  --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \
	  --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \
	  --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \
	  --output_dir data/ \
	  --lang $(MODEL_NAME)
# Fine-tune from the base model's .lstm (extracted by the unicharset rule).
# The list files are real inputs of lstmtraining, so they are declared as
# prerequisites (they were hidden dependencies before).
data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset proto-model data/list.train data/list.eval
	@mkdir -p data/checkpoints
	lstmtraining \
	  --continue_from $(TESSDATA)/$(CONTINUE_FROM).lstm \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
	  --model_output data/checkpoints/$(MODEL_NAME) \
	  --debug_interval -1 \
	  --train_listfile data/list.train \
	  --eval_listfile data/list.eval \
	  --sequential_training \
	  --max_iterations 4000
# Finalize training: convert the checkpoint (the sole prerequisite, $<)
# into a standalone traineddata file at $@.
data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint
	lstmtraining \
	  --stop_training \
	  --continue_from $< \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
	  --model_output $@
# Clean all generated files.
# Fix from review: the box/lstmf files are generated under $(TRAIN)
# (train_data), not under data/train, so the old find paths deleted nothing.
.PHONY: clean
clean:
	find $(TRAIN) -name '*.box' -delete
	find $(TRAIN) -name '*.lstmf' -delete
	rm -rf data/all-*
	rm -rf data/list.*
	rm -rf data/$(MODEL_NAME)
	rm -rf data/unicharset
	rm -rf data/checkpoints
Now the unicharset for my dataset is quite limited and it doesn't contain
all the characters present in latin.unicharset. I want to train Tesseract
on my dataset so that it can recognize characters better on the kind of
images that I will be feeding to Tesseract, but I don't want it to overfit
on the limited characters that I have in my dataset. Is it possible to do
that?
For example: all my training images contain the letter 'E'. My dev and test
set images contain the letters 'F' and 'G'. Is it possible to train on 'E'
without losing accuracy on 'F' and 'G'? Basically this is a problem of
incremental learning.
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/tesseract-ocr/e8b832df-ff7b-401d-bc3a-bc64c4cf1fcd%40googlegroups.com.