>
> Can't tell you for Windows.
>
On a Unix-like system it works like this:
# Run Tesseract OCR on one image file and collect its text output.
#
# Parameters:
#   $conf      - hash ref; reads 'tesseract' (path of the executable, e.g.
#                '/usr/local/bin/tesseract') and, optionally, 'tessdata'
#                (e.g. '/usr/local/share/tessdata').
#   $imagefile - path of the image to recognise.
#
# Also reads the file-level $options hash ref (declared outside this sub):
# 'language', 'psm', 'file_format' and 'verbose'.
#
# Side effects:
#   - sets $ENV{TESSDATA_PREFIX} when $conf->{'tessdata'} is set;
#   - runs tesseract with the image name minus its extension as output base;
#   - renames a 'tessinput.tif' found in the current directory to
#     "<base>.tessinput.tif";
#   - when the output formats include txt, appends "<base>.txt" to
#     "<base without trailing _NNN>.tess.txt".
#
# Dies if no executable is configured, if tesseract cannot be started, or if
# the text output cannot be appended. A non-zero exit status of tesseract is
# reported with a warning only, so processing continues as before.
sub tesseract {
    my ($conf, $imagefile) = @_;

    # Equivalent command line:
    #   tesseract my_0002.png my_0002 -l LANG --psm N --tessdata-dir DIR txt
    # Further options that can be added by hand:
    #   -c load_bigram_dawg=false -c load_freq_dawg=false
    #   -c load_system_dawg=false
    #   -c tessedit_write_images=true    (writes tessinput.tif)
    #   --oem 3

    my $verbose = $options->{'verbose'} || 0;

    my $command = $conf->{'tesseract'};
    die "tesseract: no 'tesseract' executable configured"
        unless defined $command && length $command;

    $ENV{'TESSDATA_PREFIX'} = $conf->{'tessdata'} if $conf->{'tessdata'};

    # Output base name: the image name without its extension.
    (my $basename = $imagefile) =~ s/\.(png|jpg|tif|gif)$//i;

    # Output formats, e.g. 'makebox hocr txt pdf' (writes $base.box,
    # $base.hocr, $base.txt, ...). Falls back to 'txt' when not configured.
    my $files = $options->{'file_format'};
    $files = 'txt' unless defined $files && $files =~ m/\S/;

    # Page segmentation mode; anything but a 1-2 digit number keeps mode 4.
    my $psm = 4;
    if (defined $options->{'psm'} && $options->{'psm'} =~ m/\A\d{1,2}\z/) {
        $psm = $options->{'psm'};
    }

    # Build the command as a list so that system() bypasses the shell and
    # file names containing spaces or metacharacters are passed verbatim.
    my @command = ($command, $imagefile, $basename);
    if (defined $options->{'language'} && length $options->{'language'}) {
        # Without this guard a bare '-l' would swallow the next argument.
        push @command, '-l', $options->{'language'};
    }
    push @command, '--psm', $psm;
    push @command, '--tessdata-dir', $conf->{'tessdata'}
        if $conf->{'tessdata'};
    push @command, split(' ', $files);

    print STDERR join(' ', @command), "\n" if $verbose >= 1;
    system(@command);
    if ($? == -1) {
        die "$command $imagefile failed: $!";
    }
    elsif ($? & 127) {
        warn sprintf("%s %s died with signal %d\n",
            $command, $imagefile, $? & 127);
    }
    elsif ($? >> 8) {
        warn sprintf("%s %s exited with status %d\n",
            $command, $imagefile, $? >> 8);
    }

    # tessedit_write_images=true always writes 'tessinput.tif' into the
    # current directory; give it a per-image name so it is not overwritten.
    if (-f 'tessinput.tif') {
        my $new_name = $basename . '.tessinput.tif';
        rename('tessinput.tif', $new_name)
            or warn "cannot rename tessinput.tif to $new_name: $!\n";
    }

    # Append this image's text to the combined file, whose name is the
    # base name with a trailing _<digits> suffix removed.
    my $txtfile = $basename . '.txt';
    (my $document = $basename) =~ s/_\d+$//;
    my $txtall = $document . '.tess.txt';

    if ($files =~ m/txt/ && -f $txtfile) {
        print STDERR "cat $txtfile >> $txtall\n" if $verbose >= 1;

        open(my $in, '<', $txtfile)
            or die "cannot read $txtfile: $!";
        open(my $out, '>>', $txtall)
            or die "cannot append to $txtall: $!";
        # Copy bytes unchanged, exactly as cat would.
        binmode($in);
        binmode($out);
        while (my $line = <$in>) {
            print {$out} $line
                or die "cannot write to $txtall: $!";
        }
        close($in);
        # Buffered write errors only surface on close.
        close($out)
            or die "cannot finish writing $txtall: $!";
    }

    return;
}
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/tesseract-ocr/2e66d988-28a7-4b01-983e-6fa7cf12e178o%40googlegroups.com.