Re: [tesseract-ocr] Re: Looking to hire a pytesseract consultant via skype

Lorenzo Bolzani Thu, 02 Apr 2020 02:44:29 -0700

Hi,
you could try to look at the distances between the symbol boxes, see the
attached script.


It's not very reliable, as it depends very much on how you preprocess the
text, and you have to find the magic threshold. I'm using the 4.0 version;
symbol boxes were improved in the 4.1 version, so it could work better there.

Or, maybe simpler, you could rely on the fact that letters are alone or
followed by numbers, never as pairs.

If this is always the case this should work:

import re

s = "N1VN2V2N2V2TRV3T3R3RIN8S8R2R1T6R2T2T1"
# Raw string so that \w and \d reach the regex engine untouched (a plain
# string literal triggers an invalid-escape warning on recent Pythons).
# Splitting on a capturing group keeps each matched token in the result,
# interleaved with empty strings, which are filtered out below.
parts = re.compile(r"(\w\d?)").split(s)
# Distinct loop name so the input string `s` is never rebound.
parts = [p for p in parts if p]

There is a mistake on R1 being seen as RI. If I is not a valid letter you
can do a simple replace of I with 1.


Bye

Lorenzo


Il giorno gio 26 mar 2020 alle ore 06:24 Bill Upham <[email protected]>
ha scritto:

> Thank you Aaron for the information, it was an improvement, I'm attaching
> one of the png files that I read. (I have 200)
> It is interesting how It is still not reading every file 100% correctly.
> My script counts the digits and sometimes it misses one of them or it calls
> a 1 a 15.
> Maybe I'm expecting perfection from computer vision and that's just not
> the case!
> Thanks again
>
> Bill Upham
>
> On Sat, Mar 14, 2020 at 3:03 PM Aaron Stewart <[email protected]>
> wrote:
>
>>
>> roi = cv2.resize(roi, None, fx=2, fy=2)
>> _, roi = cv2.threshold(roi, 128+64, 255, cv2.THRESH_BINARY)
>> roi = cv2.GaussianBlur(roi, (3,3), 0)
>> text_detected = image_to_string(roi, config="--psm 10 --oem 3
>> tessedit_char_whitelist=0123456789", )
>>
>> --
>> You received this message because you are subscribed to the Google Groups
>> "tesseract-ocr" group.
>> To unsubscribe from this group and stop receiving emails from it, send an
>> email to [email protected].
>> To view this discussion on the web visit
>> https://groups.google.com/d/msgid/tesseract-ocr/2ca084e4-aae6-423e-b359-a472e00579e6%40googlegroups.com
>> <https://groups.google.com/d/msgid/tesseract-ocr/2ca084e4-aae6-423e-b359-a472e00579e6%40googlegroups.com?utm_medium=email&utm_source=footer>
>> .
>>
> --
> You received this message because you are subscribed to the Google Groups
> "tesseract-ocr" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/tesseract-ocr/CAF5KrqB3HiPT3cKP6QLUR4u%2Bu3W1B7VbdUfKLfBYs-HnumwZWg%40mail.gmail.com
> <https://groups.google.com/d/msgid/tesseract-ocr/CAF5KrqB3HiPT3cKP6QLUR4u%2Bu3W1B7VbdUfKLfBYs-HnumwZWg%40mail.gmail.com?utm_medium=email&utm_source=footer>
> .
>

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLwx%2BRww%3DaxGXp%2BqfPd8rstaNaxTAb1CA2TtM-9spqV%2BiA%40mail.gmail.com.

'''
Created on May 16, 2018

@author: trz
'''

import logging
import sys
import time

import tesserocr
from collections import deque
from multiprocessing import Lock
from tesserocr import PyTessBaseAPI, RIL, iterate_level
from operator import itemgetter

import cv2
import numpy as np


# Language name handed to PyTessBaseAPI in recognize_text. The commented-out
# values are presumably alternative trained models -- verify before enabling.
# DEFAULT_LANG="ita15kf"
# DEFAULT_LANG="elett_16000"
DEFAULT_LANG = "eng"

# Colour tuples passed to cv2.rectangle on an image produced by
# cv2.COLOR_GRAY2BGR (see recognize_text), so channel order is B, G, R.
# `colors` is cycled per symbol (index % 3) for the symbol bounding boxes.
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
# NOTE(review): in BGR order (255, 255, 0) renders as cyan rather than
# yellow -- confirm which colour is intended.
yellow = (255, 255, 0)
magenta = (255, 0, 255)
# Not referenced in the visible part of this script.
gray = (100, 100, 100)

# Not referenced in the visible part of this script.
DEBUG = False


def recognize_text(raw_img, gap_threshold=12):
    """Run Tesseract on an image and visualise the gaps between symbols.

    Every recognised symbol gets a bounding box drawn on a BGR copy of
    ``raw_img``, plus a small marker alternating between the top and the
    bottom edge of the image so adjacent boxes are easy to tell apart.
    Between consecutive symbols of the same text line a horizontal bar is
    drawn at mid-height of the current symbol: light gray when the gap is
    below ``gap_threshold``, green otherwise. Each gap is also printed to
    stdout. Finally the annotated image is shown in an OpenCV window and
    the call blocks until a key is pressed.

    Args:
        raw_img: 2-D single-channel image array; it is passed to Tesseract
            with 1 byte per pixel and converted with ``COLOR_GRAY2BGR``
            for drawing (assumes dtype uint8 -- confirm against caller).
        gap_threshold: horizontal distance in pixels from which a gap is
            highlighted in green. Defaults to 12, the value that used to be
            hard-coded.

    Returns:
        None.
    """
    # The context manager releases the native Tesseract handle even if
    # recognition or drawing raises. The constructor already initialises the
    # engine for `lang`, so no separate Init() call is needed.
    with PyTessBaseAPI(lang=DEFAULT_LANG) as api:
        logging.info("Tesseract version %s", api.Version())
        api.SetPageSegMode(tesserocr.PSM.SINGLE_BLOCK)

        height, width = raw_img.shape[0], raw_img.shape[1]
        # 1 byte per pixel, and one row occupies `width` bytes.
        api.SetImageBytes(raw_img.tobytes(), width, height, 1, width)

        api.Recognize()
        ri = api.GetIterator()

        draw_img = cv2.cvtColor(raw_img, cv2.COLOR_GRAY2BGR)
        bottom_y = draw_img.shape[0] - 2

        prev_symbol_end = None  # right edge of the previous symbol on this line
        prev_symbol = None
        for i, r in enumerate(iterate_level(ri, RIL.SYMBOL)):
            bounds = r.BoundingBox(RIL.SYMBOL)
            conf = r.Confidence(RIL.SYMBOL)

            if bounds is None:
                print("symbolBounds is None", conf)
                continue

            left, top, right, bottom = bounds
            symbol = r.GetUTF8Text(RIL.SYMBOL)

            # Symbol box, colour cycling so neighbouring boxes differ.
            cv2.rectangle(draw_img, (left, top), (right, bottom), colors[i % 3])
            # Alternate a marker between the top and bottom image edge.
            if i % 2 == 0:
                cv2.rectangle(draw_img, (left, 0), (right, 1), yellow, 2)
            else:
                cv2.rectangle(draw_img, (left, bottom_y), (right, bottom_y),
                              magenta, 2)

            # A gap is only meaningful within one text line: the distance
            # from the end of a line to the start of the next one would be
            # negative and would paint a bar across the whole image.
            if r.IsAtBeginningOf(RIL.TEXTLINE):
                prev_symbol_end = None
                prev_symbol = None

            # space between symbols
            if prev_symbol_end is None:
                space_between_symbols = 0
            else:
                space_between_symbols = left - prev_symbol_end
            print("From", prev_symbol, "to", symbol, space_between_symbols)

            # No bar for the first symbol of a line: there is no previous
            # symbol to measure from.
            if prev_symbol_end is not None:
                half_h = (bottom - top) // 2 + top
                if space_between_symbols < gap_threshold:
                    color = (200, 200, 200)
                else:
                    color = (0, 200, 0)
                cv2.rectangle(draw_img, (prev_symbol_end, half_h - 2),
                              (left, half_h + 2), color, -1)

            prev_symbol_end = right
            prev_symbol = symbol

    #cv2.imwrite("boxes.jpg", draw_img)
    cv2.imshow("ocr boxes", draw_img)
    cv2.waitKey(0)


if __name__ == '__main__':
    # Root logger at DEBUG so the logging.info call in recognize_text is
    # not filtered out.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Fail with a usage message instead of an IndexError when no image path
    # is given on the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: %s IMAGE_FILE" % sys.argv[0])

    filename = sys.argv[1]
    # IMREAD_GRAYSCALE is the named form of the flag value 0: the image is
    # loaded as a single-channel array, which recognize_text expects.
    img = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals an unreadable or missing file by returning None
    # rather than raising; stop here with a clear message.
    if img is None:
        sys.exit("could not read image: %s" % filename)

    recognize_text(img)

Re: [tesseract-ocr] Re: Looking to hire a pytesseract consultant via skype

Reply via email to