I have been using the "tesseractdotnet" source code, which can return character positions.
"tesseractdotnet" is based on tesseract-ocr v3.01 r590.
When I take tesseract 3.02 r729 from svn and use the code below on
Windows XP (VS2008),
bool succed = api->Recognize(monitor) >= 0;
succed is true, but
int nChars = head->count;
nChars is always zero.
How can I get character positions in tesseract 3.02 r729?
Detailed code:
// Forward declaration: image2text is defined further down in this file.
int image2text(const char* image, const char* lang, int psm, char* textout);

/**
 * Runs OCR on a fixed sample image and prints the recognized text.
 *
 * @return 0 on success, otherwise the non-zero status reported by image2text.
 */
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

  // String literals are const; binding them to plain char* is ill-formed in
  // modern C++.
  const char* image1 = "c:\\eurotext.tif";
  const char* lang1 = "eng";
  const int psm = 3;

  // image2text copies the recognized text without a length limit, so the
  // buffer is sized generously. It is zero-initialized so that printing it is
  // well-defined even when image2text returns before writing anything.
  char text[32768] = "";

  const int rc = image2text(image1, lang1, psm, text);
  if (rc != 0) {
    fprintf(stderr, "image2text failed with code %d\n", rc);
  }
  printf("text: %s\n", text);
  return rc;
}
int image2text(const char* image,const char* lang,int psm,char* textout){
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
pagesegmode = static_cast<tesseract::PageSegMode>(psm);
tesseract::TessBaseAPI api;
// initialize monitor
ETEXT_DESC* monitor = NULL;
ETEXT_DESC* head = NULL;
int fixed_buffer_factor = 100;
int MAX_CHAR_RECOGNIZE = 32000;
int n = 127;
monitor = new ETEXT_DESC[fixed_buffer_factor*n];
monitor[1].more_to_come = 127;
monitor[1].count = 0;
// initialize api
int rc = api.Init(image, lang, tesseract::OEM_DEFAULT);
if (rc) {
// fprintf(stderr, "Could not initialize tesseract.\n");
return 1;
}
// We have 2 possible sources of pagesegmode: a config file and
// the command line. For backwards compatability reasons, the
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
// default for this program is tesseract::PSM_AUTO. We will let
// the config file take priority, so the command-line default
// can take priority over the tesseract default, so we use the
// value from the command line only if the retrieved mode
// is still tesseract::PSM_SINGLE_BLOCK, indicating no change
// in any config file. Therefore the only way to force
// tesseract::PSM_SINGLE_BLOCK is from the command line.
// It would be simpler if we could set the value before Init,
// but that doesn't work.
if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
api.SetPageSegMode(pagesegmode);
FILE* fin = fopen(image, "rb");
if (fin == NULL) {
// printf("Cannot open input file: %s\n", image);
return 2;
}
fclose(fin);
PIX *pixs;
if ((pixs = pixRead(image)) == NULL) {
// printf("Unsupported image type.\n");
return 3;
}
// detect characters
api.Clear();
api.ClearAdaptiveClassifier();
try
{
api.SetImage(pixs);
bool succed = api.Recognize(monitor) >= 0;
char* text = api.GetUTF8Text();
sprintf(textout, "%s", text);
delete text;
head = &monitor[1];
int lineIndex=0;
int lineIdx = 0;
int nChars = head->count;
int i = 0;
while (i < nChars)
{
// typedef struct { /*single character */
// It should be noted that the format for char_code for version
2.0 and beyond
// is UTF8 which means that ASCII characters will come out as
one structure but
// other characters will be returned in two or more instances
of this structure
// with a single byte of the UTF8 code in each, but each will
have the same
// bounding box. Programs which want to handle languagues with
different
// characters sets will need to handle extended characters
appropriately, but
// *all* code needs to be prepared to receive UTF8 coded
characters for
// characters such as bullet and fancy quotes.
// uinT16 char_code; /*character itself */
// inT16 left; /*of char (-1) */
// inT16 right; /*of char (-1) */
// inT16 top; /*of char (-1) */
// inT16 bottom; /*of char (-1) */
// inT16 font_index; /*what font (0) */
// uinT8 confidence; /*0=perfect, 100=reject
(0/100) */
// uinT8 point_size; /*of char, 72=i inch, (10) */
// inT8 blanks; /*no of spaces before this
char (1) */
// uinT8 formatting; /*char formatting (0) */
//} EANYCODE_CHAR; /*single character */
EANYCODE_CHAR* ch = &(head + i)->text[0];
printf("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,", ch->char_code,
ch->left, ch->top, ch->right, ch->bottom, ch->font_index, ch->confidence,
ch->point_size, ch->formatting);
i++; /*go to next char*/
} /* end while */
}
catch (...)
{
}
head = NULL;
monitor = NULL;
pixDestroy(&pixs);
return 0;
}
--
You received this message because you are subscribed to the Google
Groups "tesseract-ocr" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/tesseract-ocr?hl=en