Keep the foo.traineddata file inside the tessdata folder, and then run the
command.

On Sun, Jul 29, 2018 at 5:00 AM <[email protected]> wrote:

> I am using a bash script to train an LSTM model. I have the images and box
> files.
>
>
> My problem is that an error is returned when the combine_tessdata command is
> executed. Also, I have checked, and no file called foo.traineddata was created.
>
>
> Here is the bash code.
> export
>
>
> SHELL := /bin/bash
> LOCAL := $(PWD)/usr
> PATH := $(LOCAL)/bin:$(PATH)
> TESSDATA =  /usr/share/tesseract-ocr/tessdata
> LANGDATA = $(PWD)/langdata
>
>
> # Name of the model to be built. Default: $(MODEL_NAME)
> MODEL_NAME = foo
>
>
> # Name of the model to continue from. Default: $(CONTINUE_FROM)
> CONTINUE_FROM = $(MODEL_NAME)
>
>
> # No of cores to use for compiling leptonica/tesseract. Default: $(CORES)
> CORES = 4
>
>
> # Leptonica version. Default: $(LEPTONICA_VERSION)
> LEPTONICA_VERSION := 1.75.3
>
>
> # Tesseract commit. Default: $(TESSERACT_VERSION)
> TESSERACT_VERSION := 9ae97508aed1e5508458f1181b08501f984bf4e2
>
>
> # Tesseract langdata version. Default: $(LANGDATA_VERSION)
> LANGDATA_VERSION := master
>
>
> # Tesseract model repo to use. Default: $(TESSDATA_REPO)
> TESSDATA_REPO = _fast
>
>
> # Train directory. Default: $(TRAIN)
> TRAIN := data/train
>
>
> # Normalization Mode - see src/training/language_specific.sh for details.
> Default: $(NORM_MODE)
> NORM_MODE = 2
>
>
> # Page segmentation mode. Default: $(PSM)
> PSM = 6
>
>
> # Ratio of train / eval training data. Default: $(RATIO_TRAIN)
> RATIO_TRAIN := 0.90
>
>
> # BEGIN-EVAL makefile-parser --make-help Makefile
>
>
> help:
>  @echo ""
>  @echo "  Targets"
>  @echo ""
>  @echo "    unicharset       Create unicharset"
>  @echo "    lists            Create lists of lstmf filenames for training
> and eval"
>  @echo "    training         Start training"
>  @echo "    proto-model      Build the proto model"
>  @echo "    leptonica        Build leptonica"
>  @echo "    tesseract        Build tesseract"
>  @echo "    tesseract-langs  Download tesseract-langs"
>  @echo "    langdata         Download langdata"
>  @echo "    clean            Clean all generated files"
>  @echo ""
>  @echo "  Variables"
>  @echo ""
>  @echo "    MODEL_NAME         Name of the model to be built. Default:
> $(MODEL_NAME)"
>  @echo "    CONTINUE_FROM      Name of the model to continue from.
> Default: $(CONTINUE_FROM)"
>  @echo "    CORES              No of cores to use for compiling
> leptonica/tesseract. Default: $(CORES)"
>  @echo "    LEPTONICA_VERSION  Leptonica version. Default:
> $(LEPTONICA_VERSION)"
>  @echo "    TESSERACT_VERSION  Tesseract commit. Default:
> $(TESSERACT_VERSION)"
>  @echo "    LANGDATA_VERSION   Tesseract langdata version. Default:
> $(LANGDATA_VERSION)"
>  @echo "    TESSDATA_REPO      Tesseract model repo to use. Default:
> $(TESSDATA_REPO)"
>  @echo "    TRAIN              Train directory. Default: $(TRAIN)"
>  @echo "    NORM_MODE          Normalization Mode - see
> src/training/language_specific.sh for details. Default: $(NORM_MODE)"
>  @echo "    PSM                Page segmentation mode. Default: $(PSM)"
>  @echo "    RATIO_TRAIN        Ratio of train / eval training data.
> Default: $(RATIO_TRAIN)"
>
>
> # END-EVAL
>
>
> ALL_BOXES = data/all-boxes
> ALL_LSTMF = data/all-lstmf
>
>
> # Create unicharset
> unicharset: data/unicharset
>
>
> # Create lists of lstmf filenames for training and eval
> lists: $(ALL_LSTMF) data/list.train data/list.eval
>
>
> data/list.train: $(ALL_LSTMF)
>  total=`cat $(ALL_LSTMF) | wc -l` \
>     no=`echo "$$total * $(RATIO_TRAIN) / 1" | bc`; \
>     head -n "$$no" $(ALL_LSTMF) > "$@"
>
>
> data/list.eval: $(ALL_LSTMF)
>  total=`cat $(ALL_LSTMF) | wc -l` \
>     no=`echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc`; \
>     tail -n "+$$no" $(ALL_LSTMF) > "$@"
>
>
> # Start training
> training: data/$(MODEL_NAME).traineddata
>
>
> data/unicharset: $(ALL_BOXES)
>  combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata  $(TESSDATA
> )/$(CONTINUE_FROM).
>  unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" --norm_mode
> $(NORM_MODE) "$(ALL_BOXES)"
>  merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset $(TRAIN)/
> my.unicharset  "$@"
>
>
> $(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
>  find $(TRAIN) -name '*.box' -exec cat {} \; > "$@"
>
>
> $(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%.gt.txt
>  python3 generate_line_box.py -i "$(TRAIN)/$*.tif" -t "$(TRAIN)/$*.gt.txt"
> > "$@"
>
>
> $(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
>  find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"
>
>
> $(TRAIN)/%.lstmf: $(TRAIN)/%.box
>  tesseract $(TRAIN)/$*.tif $(TRAIN)/$* --psm $(PSM) lstm.train
>
>
> # Build the proto model
> proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata
>
>
> data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: $(LANGDATA) data/unicharset
>  combine_lang_model \
>    --input_unicharset data/unicharset \
>    --script_dir $(LANGDATA) \
>    --output_dir data/ \
>    --lang $(MODEL_NAME)
>
>
> data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset lists proto-model
>  mkdir -p data/checkpoints
>  lstmtraining \
>    --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
>    --net_spec "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c`head
> -n1 data/unicharset`]" \
>    --model_output data/checkpoints/$(MODEL_NAME) \
>    --learning_rate 20e-4 \
>    --train_listfile data/list.train \
>    --eval_listfile data/list.eval \
>    --max_iterations 10000
>
>
> data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint
>  lstmtraining \
>  --stop_training \
>  --continue_from $^ \
>  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
>  --model_output $@
>
>
> # Build leptonica
> leptonica: leptonica.built
>
>
> leptonica.built: leptonica-$(LEPTONICA_VERSION)
>  cd $< ; \
>  ./configure --prefix=$(LOCAL) && \
>  make -j$(CORES) && \
>  make install && \
>  date > "$@"
>
>
> leptonica-$(LEPTONICA_VERSION): leptonica-$(LEPTONICA_VERSION).tar.gz
>  tar xf "$<"
>
>
> leptonica-$(LEPTONICA_VERSION).tar.gz:
>  wget 'http://www.leptonica.org/source/$@'
>
>
> # Build tesseract
> tesseract: tesseract.built tesseract-langs
>
>
> tesseract.built: tesseract-$(TESSERACT_VERSION)
>  cd $< && \
>  sh autogen.sh && \
>  PKG_CONFIG_PATH="$(LOCAL)/lib/pkgconfig" \
>  LEPTONICA_CFLAGS="-I$(LOCAL)/include/leptonica" \
>  ./configure --prefix=$(LOCAL) && \
>  LDFLAGS="-L$(LOCAL)/lib"\
>  make -j$(CORES) && \
>  make install && \
>  make -j$(CORES) training-install && \
>  date > "$@"
>
>
> tesseract-$(TESSERACT_VERSION):
>  wget
> https://github.com/tesseract-ocr/tesseract/archive/$(TESSERACT_VERSION).zip
>  unzip $(TESSERACT_VERSION).zip
>
>
> # Download tesseract-langs
> tesseract-langs: $(TESSDATA)/eng.traineddata
>
>
> # Download langdata
> langdata: $(LANGDATA)
>
>
> $(LANGDATA):
>  #wget '
> https://github.com/tesseract-ocr/langdata/archive/$(LANGDATA_VERSION).zip'
>  unzip $(LANGDATA_VERSION).zip
>
>
> $(TESSDATA)/eng.traineddata:
>  cd $(TESSDATA) && wget
> https://github.com/tesseract-ocr/tessdata$(TESSDATA_REPO)/raw/master/$(notdir
> $@)
>
>
> # Clean all generated files
> clean:
>  find data/train -name '*.box' -delete
>  find data/train -name '*.lstmf' -delete
>  rm -rf data/all-*
>  rm -rf data/list.*
>  rm -rf data/$(MODEL_NAME)
>  rm -rf data/unicharset
>  rm -rf data/checkpoints
>
>
> Also, here is the error:
>
>
> combine_tessdata -u /usr/share/tesseract-ocr/tessdata/foo.traineddata  /
> usr/share/tesseract-ocr/tessdata/foo.
> Failed to read /usr/share/tesseract-ocr/tessdata/foo.traineddata
> Makefile:97: recipe for target 'data/unicharset' failed
> make: *** [data/unicharset] Error 1
>
>
>
> --
> You received this message because you are subscribed to the Google Groups
> "tesseract-ocr" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> To post to this group, send email to [email protected].
> Visit this group at https://groups.google.com/group/tesseract-ocr.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/tesseract-ocr/964f8a60-ec0e-44d9-a6a2-1b81eb49ab2b%40googlegroups.com
> <https://groups.google.com/d/msgid/tesseract-ocr/964f8a60-ec0e-44d9-a6a2-1b81eb49ab2b%40googlegroups.com?utm_medium=email&utm_source=footer>
> .
> For more options, visit https://groups.google.com/d/optout.
>

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/CAD_EDkZPetwKCM0wOpSNN3nMnpDTsWkHe7pvS_e%2BXgMe9dUfuQ%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to