Hi,
I just completed the build of tesseract-ocr-3.04.00
including the training portion.
Attached is the patch I used, together with
configure LIBS="$(pkg-config --libs icu-i18n)"
to correctly include the icu dependency.
From what I can see, the additional steps
make training
make training-install
install only these additional files:
/usr/bin/ambiguous_words.exe
/usr/bin/classifier_tester.exe
/usr/bin/cntraining.exe
/usr/bin/combine_tessdata.exe
/usr/bin/dawg2wordlist.exe
/usr/bin/mftraining.exe
/usr/bin/set_unicharset_properties.exe
/usr/bin/shapeclustering.exe
/usr/bin/text2image.exe
/usr/bin/unicharset_extractor.exe
/usr/bin/wordlist2dawg.exe
The full list is attached.
Questions:
- Is anything missing?
- Which portion of
https://github.com/tesseract-ocr/langdata
would you like to see in a training data package?
The current package split is available at:
https://cygwin.com/packages/x86_64/tesseract-ocr/tesseract-ocr-3.04.00-1
https://cygwin.com/packages/x86_64/tesseract-ocr-devel/tesseract-ocr-devel-3.04.00-1
https://cygwin.com/packages/x86_64/libtesseract-ocr_3/libtesseract-ocr_3-3.04.00-1
Only the English language is installed by default, and it also contains the
OSD data:
https://cygwin.com/packages/x86_64/tesseract-ocr-eng/tesseract-ocr-eng-3.04-1
Other language packages:
tesseract-ocr-deu/
tesseract-ocr-fra/
tesseract-ocr-ita/
tesseract-ocr-nld/
tesseract-ocr-por/
tesseract-ocr-spa/
tesseract-ocr-vie/
Regards
Marco
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit
https://groups.google.com/d/msgid/tesseract-ocr/55B80674.4070709%40gmail.com.
For more options, visit https://groups.google.com/d/optout.
--- origsrc/tesseract-3.04.00/ccutil/ambigs.cpp 2015-07-11 09:53:12.000000000
+0200
+++ src/tesseract-3.04.00/ccutil/ambigs.cpp 2015-07-28 23:49:08.285967500
+0200
@@ -24,13 +24,13 @@
#include "helpers.h"
#include "universalambigs.h"
-#if defined _WIN32 || defined(__CYGWIN__)
+#if defined _WIN32
#ifndef __GNUC__
#define strtok_r strtok_s
#else
#include "strtok_r.h"
#endif /* __GNUC__ */
-#endif /* _WIN32 __CYGWIN__*/
+#endif /* _WIN32 */
namespace tesseract {
--- origsrc/tesseract-3.04.00/configure.ac 2015-07-11 09:53:12.000000000
+0200
+++ src/tesseract-3.04.00/configure.ac 2015-07-29 00:09:51.557732200 +0200
@@ -88,7 +88,7 @@ case "${host_os}" in
;;
cygwin*)
AM_CONDITIONAL(ADD_RT, false)
- AM_CONDITIONAL(T_WIN, true)
+ AM_CONDITIONAL(T_WIN, false)
AC_SUBST([AM_LDFLAGS], ['-Wl,-no-undefined -Wl,--as-needed'])
;;
solaris*)
--- origsrc/tesseract-3.04.00/training/pango_font_info.cpp 2015-07-11
09:53:12.000000000 +0200
+++ src/tesseract-3.04.00/training/pango_font_info.cpp 2015-07-28
23:32:10.261768400 +0200
@@ -18,6 +18,7 @@
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
+#define _GNU_SOURCE
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
./usr/bin/ambiguous_words.exe
./usr/bin/classifier_tester.exe
./usr/bin/cntraining.exe
./usr/bin/combine_tessdata.exe
./usr/bin/cygtesseract-3.dll
./usr/bin/dawg2wordlist.exe
./usr/bin/mftraining.exe
./usr/bin/set_unicharset_properties.exe
./usr/bin/shapeclustering.exe
./usr/bin/tesseract.exe
./usr/bin/text2image.exe
./usr/bin/unicharset_extractor.exe
./usr/bin/wordlist2dawg.exe
./usr/include/tesseract/apitypes.h
./usr/include/tesseract/baseapi.h
./usr/include/tesseract/basedir.h
./usr/include/tesseract/capi.h
./usr/include/tesseract/errcode.h
./usr/include/tesseract/fileerr.h
./usr/include/tesseract/genericvector.h
./usr/include/tesseract/helpers.h
./usr/include/tesseract/host.h
./usr/include/tesseract/ltrresultiterator.h
./usr/include/tesseract/memry.h
./usr/include/tesseract/ndminx.h
./usr/include/tesseract/ocrclass.h
./usr/include/tesseract/osdetect.h
./usr/include/tesseract/pageiterator.h
./usr/include/tesseract/params.h
./usr/include/tesseract/platform.h
./usr/include/tesseract/publictypes.h
./usr/include/tesseract/renderer.h
./usr/include/tesseract/resultiterator.h
./usr/include/tesseract/serialis.h
./usr/include/tesseract/strngs.h
./usr/include/tesseract/tesscallback.h
./usr/include/tesseract/thresholder.h
./usr/include/tesseract/unichar.h
./usr/include/tesseract/unicharmap.h
./usr/include/tesseract/unicharset.h
./usr/lib/libtesseract.dll.a
./usr/lib/pkgconfig/tesseract.pc
./usr/share/doc/tesseract-ocr/AUTHORS
./usr/share/doc/tesseract-ocr/ChangeLog
./usr/share/doc/tesseract-ocr/COPYING
./usr/share/doc/tesseract-ocr/NEWS
./usr/share/doc/tesseract-ocr/README
./usr/share/man/man1/ambiguous_words.1.gz
./usr/share/man/man1/cntraining.1.gz
./usr/share/man/man1/combine_tessdata.1.gz
./usr/share/man/man1/dawg2wordlist.1.gz
./usr/share/man/man1/mftraining.1.gz
./usr/share/man/man1/shapeclustering.1.gz
./usr/share/man/man1/tesseract.1.gz
./usr/share/man/man1/unicharset_extractor.1.gz
./usr/share/man/man1/wordlist2dawg.1.gz
./usr/share/man/man5/unicharambigs.5.gz
./usr/share/man/man5/unicharset.5.gz
./usr/share/tessdata/configs/ambigs.train
./usr/share/tessdata/configs/api_config
./usr/share/tessdata/configs/bigram
./usr/share/tessdata/configs/box.train
./usr/share/tessdata/configs/box.train.stderr
./usr/share/tessdata/configs/digits
./usr/share/tessdata/configs/hocr
./usr/share/tessdata/configs/inter
./usr/share/tessdata/configs/kannada
./usr/share/tessdata/configs/linebox
./usr/share/tessdata/configs/logfile
./usr/share/tessdata/configs/makebox
./usr/share/tessdata/configs/pdf
./usr/share/tessdata/configs/quiet
./usr/share/tessdata/configs/rebox
./usr/share/tessdata/configs/strokewidth
./usr/share/tessdata/configs/unlv
./usr/share/tessdata/pdf.ttf
./usr/share/tessdata/tessconfigs/batch
./usr/share/tessdata/tessconfigs/batch.nochop
./usr/share/tessdata/tessconfigs/matdemo
./usr/share/tessdata/tessconfigs/msdemo
./usr/share/tessdata/tessconfigs/nobatch
./usr/share/tessdata/tessconfigs/segdemo