I used the source code by "tesseractdotnet<http://code.google.com/p/tesseractdotnet/>", it can get characters position.
"tesseractdotnet <http://code.google.com/p/tesseractdotnet/>"* based-on tesseract-ocr v3.01 r590. *When I get the tesseract 3.02 r729 from svn, and use the under code on Windows XP(VS2008), bool succed = api->Recognize(monitor) >= 0; succed return true, int nChars = head->count; the nChars is always zero. how to get characters position in tesseract 3.02 r729? detail code: int main(int argc, char **argv) { char* image1 = "c:\\eurotext.tif"; char* lang1 = "eng"; int psm = 3; char* text = new char[1024]; int rc = 0; rc = image2text(image1,lang1,psm,text); printf("text: %s\n", text); } int image2text(const char* image,const char* lang,int psm,char* textout){ tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; pagesegmode = static_cast<tesseract::PageSegMode>(psm); tesseract::TessBaseAPI api; // initialize monitor ETEXT_DESC* monitor = NULL; ETEXT_DESC* head = NULL; int fixed_buffer_factor = 100; int MAX_CHAR_RECOGNIZE = 32000; int n = 127; monitor = new ETEXT_DESC[fixed_buffer_factor*n]; monitor[1].more_to_come = 127; monitor[1].count = 0; // initialize api int rc = api.Init(image, lang, tesseract::OEM_DEFAULT); if (rc) { // fprintf(stderr, "Could not initialize tesseract.\n"); return 1; } // We have 2 possible sources of pagesegmode: a config file and // the command line. For backwards compatability reasons, the // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the // default for this program is tesseract::PSM_AUTO. We will let // the config file take priority, so the command-line default // can take priority over the tesseract default, so we use the // value from the command line only if the retrieved mode // is still tesseract::PSM_SINGLE_BLOCK, indicating no change // in any config file. Therefore the only way to force // tesseract::PSM_SINGLE_BLOCK is from the command line. // It would be simpler if we could set the value before Init, // but that doesn't work. if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) api.SetPageSegMode(pagesegmode); FILE* fin = fopen(image, "rb"); if (fin == NULL) { // printf("Cannot open input file: %s\n", image); return 2; } fclose(fin); PIX *pixs; if ((pixs = pixRead(image)) == NULL) { // printf("Unsupported image type.\n"); return 3; } // detect characters api.Clear(); api.ClearAdaptiveClassifier(); try { api.SetImage(pixs); bool succed = api.Recognize(monitor) >= 0; char* text = api.GetUTF8Text(); sprintf(textout, "%s", text); delete text; head = &monitor[1]; int lineIndex=0; int lineIdx = 0; int nChars = head->count; int i = 0; while (i < nChars) { // typedef struct { /*single character */ // It should be noted that the format for char_code for version 2.0 and beyond // is UTF8 which means that ASCII characters will come out as one structure but // other characters will be returned in two or more instances of this structure // with a single byte of the UTF8 code in each, but each will have the same // bounding box. Programs which want to handle languagues with different // characters sets will need to handle extended characters appropriately, but // *all* code needs to be prepared to receive UTF8 coded characters for // characters such as bullet and fancy quotes. // uinT16 char_code; /*character itself */ // inT16 left; /*of char (-1) */ // inT16 right; /*of char (-1) */ // inT16 top; /*of char (-1) */ // inT16 bottom; /*of char (-1) */ // inT16 font_index; /*what font (0) */ // uinT8 confidence; /*0=perfect, 100=reject (0/100) */ // uinT8 point_size; /*of char, 72=i inch, (10) */ // inT8 blanks; /*no of spaces before this char (1) */ // uinT8 formatting; /*char formatting (0) */ //} EANYCODE_CHAR; /*single character */ EANYCODE_CHAR* ch = &(head + i)->text[0]; printf("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,", ch->char_code, ch->left, ch->top, ch->right, ch->bottom, ch->font_index, ch->confidence, ch->point_size, ch->formatting); i++; /*go to next char*/ } /* end while */ } catch (...) { } head = NULL; monitor = NULL; pixDestroy(&pixs); return 0; } -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To post to this group, send email to [email protected] To unsubscribe from this group, send email to [email protected] For more options, visit this group at http://groups.google.com/group/tesseract-ocr?hl=en

