Comment out this line, which restricts the recognition output to digits
only:
//ocr.SetVariable("tessedit_char_whitelist", "0123456789"); // If digit only
On Thursday, July 18, 2013 3:04:46 AM UTC-5, Raymond Osterbrink wrote:
>
> Hi there,
>
> I use the common tessnet2 console sample code shown on the project site,
> with only a tiny modification (which checks for PDFs and converts them to
> JPG).
> So far it works fine; the only problem is that the words are printed as some
> numeric code (the number of codes matches the exact number of words in the
> text), which I don't know how to interpret or convert.
>
> Code:
> using System;
>
> using System.Collections.Generic;
> using System.Drawing;
> using System.IO;
> using GhostscriptSharp;
> using GhostscriptSharp.Settings;
>
>
> namespace tess_C2
> {
> class Program
> {
> static void Main(string[] args)
> {
> string end;
> do
> {
> Read();
> Console.WriteLine("\nquit app?\ny/n");
> end = Console.ReadLine();
> Console.Clear();
> }
> while (end != "y");
> }
> static void Read()
> {
> Bitmap image = new Bitmap(1,1);
> try
> { image = new Bitmap(Input.Img()); }
> catch
> { image = new Bitmap(Input.PdfConverter()); }
> tessnet2.Tesseract ocr = new tessnet2.Tesseract();
> ocr.SetVariable("tessedit_char_whitelist", "0123456789"); //
> If digit only
> ocr.Init(@"C:\tesseract\lang", Input.Lang(), false); // To
> use correct tessdata
> List<tessnet2.Word> result = ocr.DoOCR(image, Rectangle.Empty
> );
> Translate(result);
> foreach (tessnet2.Word word in result)
> { Console.WriteLine("{0} : {1}", word.Confidence, word.Text);
> }
> }
> static void Translate(List<tessnet2.Word> result)
> {
> //Translate numbers to text and send to Print()
>
> Print();
> }
> static void Print()
> {
> //Print Text
> }
> }
> class Input
> {
> static string fileDest;
> [+] public static string Img()
> [+] public static Bitmap PdfConverter()
>
> public static string Lang()
> {
> string[] selection = { "eng", "deu", "fra", "spa" };
> Console.Clear();
> Ask:
> byte b = 0;
> Console.WriteLine("select Language:");
> Console.Write("| ");
> foreach (string item in selection)
> {
> b++;
>
> Console.Write("\"{0}\" for \"{1}\" | ", b, item);
> }
> Console.Write("\n");
> try
> {
> byte sel = Convert.ToByte(Console.ReadLine());
> sel--;
> return selection[sel];
> }
> catch
> {
> Console.Clear();
> Console.WriteLine("wrong input");
> goto Ask;
> }
> }
> }
>
>
> }
>
> Output:
>
>> 42 : 5518
>> 255 : 5329
>> 255 : 50111
>> 255 : 5519
>> 123 : 0555051
>> 58 : 5150111151
>> 45 : 009
>> 194 : 5555111180
>> 57 : 01185
>> 42 : 5518
>
>
> I'm pretty sure my language files are correct; it's the
> tesseract-2.00eng.tar.gz pack containing:
>
> eng.DangAmbigs
>> eng.freq-dawg
>> eng.inttemp
>> eng.nomproto
>> eng.pffmtable
>> eng.unicharset
>> eng.user-words
>> eng.word-dawg
>
>
> However, I can imagine there are still some *.h or *.cpp files missing.
> I only discovered they were needed via the error info in debug mode (with F11),
> but I have absolutely no clue which ones might be missing.
>
> What I've got is:
>
> clst.h
>> elst.h
>> orcblock.h
>> pageres.h
>> tessnet2.cpp
>> tessnet2.h
>> varable.h
>
>
--
--
You received this message because you are subscribed to the Google
Groups "tesseract-ocr" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/tesseract-ocr?hl=en
---
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.