I am trying to read the image below with the Page Iterator. It reads the
whole page left to right except the very last column, titled "Conditional
Payment". I would like some help tweaking my code to fix this. The text for
that column shows up at the end in its own <BLOCK>. For example:
<BLOCK>
Coordination of
Benefits and Recovery
<BLOCK>
Conditional Payment
<BLOCK>
$76.57
$5.56

Also, under "Total Charges", it reads $127.00 as $12700; for some reason it
is not seeing the decimal point. Under "Reimbursement Amount", it reads
$76.57 as $76157; it seems to be reading the decimal point as a one. Under
"Diagnosis Codes" it sometimes mistakes the commas for decimal points, and
sometimes it reads a dollar sign as a 5, 3, or 8. Is there any
configuration, training data, or other suggestion that would improve
accuracy? Thank you.

Here is my image and following is my page iteration code.

/// <summary>
/// Runs Tesseract OCR over the image stored at <paramref name="imgLoc"/> and
/// writes the recognized text to the console, walking the layout result
/// block by block, paragraph by paragraph, line by line and word by word.
/// </summary>
/// <param name="imgLoc">Path of the image file to recognize.</param>
/// <remarks>
/// A <c>&lt;BLOCK&gt;</c> marker line is written whenever the iterator is at the
/// start of a layout block. Each word is followed by a space, each text line
/// by a newline, and each paragraph by an additional blank line.
/// Exceptions are traced and reported on the console; they are not rethrown.
/// </remarks>
public void iterateOCR(string imgLoc)
{
    try
    {
        using (var ocr = new Tesseract.TesseractEngine(@"./tessdata", "eng", Tesseract.EngineMode.Default))
        using (var img = Pix.LoadFromFile(imgLoc))
        {
            // The region passed to Process is measured in image pixels, not in
            // PDF points. Deriving it from the loaded image covers the entire
            // page whatever resolution the raster was extracted at, instead of
            // assuming a fixed 612x792 area.
            var wholeImage = new Rect(0, 0, img.Width, img.Height);

            using (var page = ocr.Process(img, wholeImage, PageSegMode.Auto))
            {
                Console.WriteLine("Text (iterator):");

                using (var iter = page.GetIterator())
                {
                    iter.Begin();

                    // Four nested loops, innermost first: words within a line,
                    // lines within a paragraph, paragraphs within a block, and
                    // finally the blocks of the page.
                    do
                    {
                        do
                        {
                            do
                            {
                                do
                                {
                                    if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                    {
                                        Console.WriteLine("<BLOCK>");
                                    }

                                    Console.Write(iter.GetText(PageIteratorLevel.Word));
                                    Console.Write(" ");

                                    // Last word of the current line: end the console line.
                                    if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                    {
                                        Console.WriteLine();
                                    }
                                }
                                while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                // Last line of the current paragraph: add a blank separator line.
                                if (iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                                {
                                    Console.WriteLine();
                                }
                            }
                            while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                        }
                        while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                    }
                    while (iter.Next(PageIteratorLevel.Block));
                }
            }
        }
    }
    catch (Exception e)
    {
        // Best-effort diagnostics: record the failure in the trace listeners
        // and echo it to the console; the caller is not notified.
        Trace.TraceError(e.ToString());
        Console.WriteLine("Unexpected Error: " + e.Message);
        Console.WriteLine("Details: ");
        Console.WriteLine(e.ToString());
    }
}











                       


<https://lh3.googleusercontent.com/-YcwH0eKBY-o/V-l9D27_yJI/AAAAAAAAAAs/BFuWMNLiqAo196kyb3cCxp7INC3DiRQxgCLcB/s1600/1AC9C.01_Page_0004_Image_0001.png>

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/855cc712-d4a1-4b7d-88d4-f1f640a7136a%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to