Comment out this line, which restricts the recognition output to digits 
only:

//ocr.SetVariable("tessedit_char_whitelist", "0123456789"); // If digit only


On Thursday, July 18, 2013 3:04:46 AM UTC-5, Raymond Osterbrink wrote:
>
> Hi there,
>
> I use the common tessnet2 console code shown on the project site, 
> with only a tiny modification (which checks for PDFs and converts them to 
> JPG).
> So far it works fine; the only problem is that the words are printed as some numeric 
> code (exactly as many codes as there are words in the text), which I don't know how to 
> interpret or convert.
>
> Code:
> using System;
>
> using System.Collections.Generic;
> using System.Drawing;
> using System.IO;
> using GhostscriptSharp;
> using GhostscriptSharp.Settings;
>
>
> namespace tess_C2
> {
>     class Program
>     {
>         static void Main(string[] args)
>         {
>             string end;
>             do
>             {
>                 Read();
>                 Console.WriteLine("\nquit app?\ny/n");
>                 end = Console.ReadLine();
>                 Console.Clear();
>             } 
>             while (end != "y");
>         }
>         static void Read()
>         {
>             Bitmap image = new Bitmap(1,1);
>             try
>             { image = new Bitmap(Input.Img()); }
>             catch
>             { image = new Bitmap(Input.PdfConverter()); }
>             tessnet2.Tesseract ocr = new tessnet2.Tesseract();
>             ocr.SetVariable("tessedit_char_whitelist", "0123456789"); // 
> If digit only
>             ocr.Init(@"C:\tesseract\lang", Input.Lang(), false); // To 
> use correct tessdata
>             List<tessnet2.Word> result = ocr.DoOCR(image, Rectangle.Empty
> );
>             Translate(result);
>             foreach (tessnet2.Word word in result)
>             { Console.WriteLine("{0} : {1}", word.Confidence, word.Text); 
> }
>         }
>         static void Translate(List<tessnet2.Word> result)
>         {
>             //Translate numbers to text and send to Print()
>
>             Print();
>         }
>         static void Print()
>         {
>             //Print Text
>         }
>     }
>     class Input
>     {
>         static string fileDest;
> [+]     public static string Img()
> [+]     public static Bitmap PdfConverter()
>        
>         public static string Lang()
>         {
>             string[] selection = { "eng", "deu", "fra", "spa" };
>             Console.Clear();
>             Ask:
>             byte b = 0;            
>             Console.WriteLine("select Language:");
>             Console.Write("| ");
>                 foreach (string item in selection)
>                 {
>                     b++;
>                     
>                     Console.Write("\"{0}\" for \"{1}\" | ", b, item);
>                 }
>                 Console.Write("\n");
>             try
>             {
>                 byte sel = Convert.ToByte(Console.ReadLine());
>                 sel--;
>                 return selection[sel];
>             }
>             catch
>             {
>                 Console.Clear();
>                 Console.WriteLine("wrong input");
>                 goto Ask;
>             }
>         }
>     }
>
>
> }
>
> Output:
>
>> 42 : 5518
>> 255 : 5329
>> 255 : 50111
>> 255 : 5519
>> 123 : 0555051
>> 58 : 5150111151
>> 45 : 009
>> 194 : 5555111180
>> 57 : 01185
>> 42 : 5518
>
>
> I'm pretty sure my language files are correct; it's the 
> tesseract-2.00eng.tar.gz pack containing: 
>
> eng.DangAmbigs 
>> eng.freq-dawg 
>> eng.inttemp 
>> eng.nomproto 
>> eng.pffmtable 
>> eng.unicharset 
>> eng.user-words 
>> eng.word-dawg
>
>
> However, I can imagine there are still some *.h or *.cpp files missing; 
> I found out they were needed only via the error info in debug mode (with F11), but I have 
> absolutely no clue which ones might be missing.
>
> what i've got is:
>
> clst.h
>> elst.h
>> orcblock.h
>> pageres.h
>> tessnet2.cpp
>> tessnet2.h
>> varable.h
>
>

-- 
-- 
You received this message because you are subscribed to the Google
Groups "tesseract-ocr" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/tesseract-ocr?hl=en

--- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.


Reply via email to