Hello, I'd like to know how Tesseract really deals with the recognized 
result. Or do I need to produce an hOCR file and read from it to get the 
position information? That doesn't sound good...

Le lundi 16 juillet 2012 16:43:31 UTC+2, sventech a écrit : 
>
> hOCR output includes character location -- look into that. 
> --Sven 
>
> On Sat, Jul 14, 2012 at 7:07 AM, moka <[email protected] <javascript:>> 
> wrote: 
> > I used the source code from "tesseractdotnet"; it can get character 
> positions. 
> > 
> > "tesseractdotnet" is based on tesseract-ocr v3.01 r590. 
> > 
> > When I get tesseract 3.02 r729 from svn and use the code below on 
> > Windows XP (VS2008), 
> > 
> > bool succed = api->Recognize(monitor) >= 0; 
> > succed is true, but 
> > int nChars = head->count; 
> > nChars is always zero. 
> > How do I get character positions in tesseract 3.02 r729? 
> > 
> > detail code: 
> > 
> > int main(int argc, char **argv) { 
> > 
> >     char* image1 = "c:\\eurotext.tif"; 
> >     char* lang1 = "eng"; 
> >     int psm = 3; 
> >     char* text = new char[1024]; 
> >     int rc = 0; 
> > 
> >     rc = image2text(image1,lang1,psm,text); 
> > 
> >     printf("text: %s\n", text); 
> > } 
> > 
> > int image2text(const char* image,const char* lang,int psm,char* 
> textout){ 
> >     tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; 
> >     pagesegmode = static_cast<tesseract::PageSegMode>(psm); 
> > 
> >     tesseract::TessBaseAPI  api; 
> > 
> >     // initialize monitor 
> >     ETEXT_DESC* monitor = NULL; 
> >     ETEXT_DESC* head = NULL; 
> >     int fixed_buffer_factor = 100; 
> >     int MAX_CHAR_RECOGNIZE = 32000; 
> >     int n = 127; 
> >     monitor = new ETEXT_DESC[fixed_buffer_factor*n]; 
> >     monitor[1].more_to_come = 127; 
> >     monitor[1].count = 0; 
> > 
> > 
> >     // initialize api 
> >     int rc = api.Init(image, lang, tesseract::OEM_DEFAULT); 
> >     if (rc) { 
> >         // fprintf(stderr, "Could not initialize tesseract.\n"); 
> >         return 1; 
> >     } 
> > 
> >     // We have 2 possible sources of pagesegmode: a config file and 
> >     // the command line. For backwards compatability reasons, the 
> >     // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the 
> >     // default for this program is tesseract::PSM_AUTO. We will let 
> >     // the config file take priority, so the command-line default 
> >     // can take priority over the tesseract default, so we use the 
> >     // value from the command line only if the retrieved mode 
> >     // is still tesseract::PSM_SINGLE_BLOCK, indicating no change 
> >     // in any config file. Therefore the only way to force 
> >     // tesseract::PSM_SINGLE_BLOCK is from the command line. 
> >     // It would be simpler if we could set the value before Init, 
> >     // but that doesn't work. 
> >     if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) 
> >         api.SetPageSegMode(pagesegmode); 
> > 
> >     FILE* fin = fopen(image, "rb"); 
> >     if (fin == NULL) { 
> >         // printf("Cannot open input file: %s\n", image); 
> >         return 2; 
> >     } 
> >     fclose(fin); 
> > 
> >     PIX   *pixs; 
> >     if ((pixs = pixRead(image)) == NULL) { 
> >         // printf("Unsupported image type.\n"); 
> >         return 3; 
> >     } 
> > 
> >     // detect characters 
> >     api.Clear(); 
> >     api.ClearAdaptiveClassifier(); 
> >     try 
> >     { 
> >         api.SetImage(pixs); 
> >         bool succed = api.Recognize(monitor) >= 0; 
> >         char* text = api.GetUTF8Text(); 
> >         sprintf(textout, "%s", text); 
> >         delete text; 
> > 
> >         head = &monitor[1]; 
> > 
> >         int lineIndex=0; 
> >         int lineIdx = 0; 
> >         int nChars = head->count; 
> >         int i = 0; 
> >         while (i < nChars) 
> >         { 
> >             // typedef struct {                  /*single character */ 
> >             // It should be noted that the format for char_code for 
> version 
> > 2.0 and beyond 
> >             // is UTF8 which means that ASCII characters will come out 
> as 
> > one structure but 
> >             // other characters will be returned in two or more 
> instances of 
> > this structure 
> >             // with a single byte of the  UTF8 code in each, but each 
> will 
> > have the same 
> >             // bounding box. Programs which want to handle languagues 
> with 
> > different 
> >             // characters sets will need to handle extended characters 
> > appropriately, but 
> >             // *all* code needs to be prepared to receive UTF8 coded 
> > characters for 
> >             // characters such as bullet and fancy quotes. 
> >             //  uinT16 char_code;              /*character itself */ 
> >             //  inT16 left;                    /*of char (-1) */ 
> >             //  inT16 right;                   /*of char (-1) */ 
> >             //  inT16 top;                     /*of char (-1) */ 
> >             //  inT16 bottom;                  /*of char (-1) */ 
> >             //  inT16 font_index;              /*what font (0) */ 
> >             //  uinT8 confidence;              /*0=perfect, 100=reject 
> > (0/100) */ 
> >             //  uinT8 point_size;              /*of char, 72=i inch, 
> (10) */ 
> >             //  inT8 blanks;                   /*no of spaces before 
> this 
> > char (1) */ 
> >             //  uinT8 formatting;              /*char formatting (0) */ 
> >             //} EANYCODE_CHAR;                 /*single character */ 
> > 
> > 
> >             EANYCODE_CHAR* ch = &(head + i)->text[0]; 
> >             printf("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,", ch->char_code, 
> > ch->left, ch->top, ch->right, ch->bottom, ch->font_index, 
> ch->confidence, 
> > ch->point_size, ch->formatting); 
> >             i++; /*go to next char*/ 
> >         } /* end while */ 
> > 
> >     } 
> >     catch (...) 
> >     { 
> > 
> >     } 
> > 
> >     head = NULL; 
> >     monitor = NULL; 
> > 
> >     pixDestroy(&pixs); 
> > 
> >     return 0; 
> > } 
> > 
> > 
> > 
> > 
> > 
> > 
> > -- 
> > You received this message because you are subscribed to the Google 
> > Groups "tesseract-ocr" group. 
> > To post to this group, send email to 
> > [email protected]<javascript:> 
> > To unsubscribe from this group, send email to 
> > [email protected] <javascript:> 
> > For more options, visit this group at 
> > http://groups.google.com/group/tesseract-ocr?hl=en 
>
>
>
> -- 
> ``All that is gold does not glitter, 
>   not all those who wander are lost; 
> the old that is strong does not wither, 
>   deep roots are not reached by the frost. 
> From the ashes a fire shall be woken, 
>   a light from the shadows shall spring; 
> renewed shall be blade that was broken, 
>   the crownless again shall be king.” 
>

-- 
You received this message because you are subscribed to the Google
Groups "tesseract-ocr" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/tesseract-ocr?hl=en

Reply via email to