Hi,
you could try to look at the distances between the symbol boxes, see the
attached script.
It's not very reliable as it depends very much on how you preprocess the
text and you have to find the magic threshold. I'm using the 4.0 version,
symbol boxes were improved in the 4.1 version, it could work better.
Or, maybe simpler, you could rely on the fact that letters are alone or
followed by numbers, never as pairs.
If this is always the case this should work:
s="N1VN2V2N2V2TRV3T3R3RIN8S8R2R1T6R2T2T1"
parts = re.compile("(\w\d?)").split(s)
parts =[s for s in parts if s]
There is a mistake on R1 being seen as RI. If I is not a valid letter you
can do a simple replace of I with 1.
Bye
Lorenzo
Il giorno gio 26 mar 2020 alle ore 06:24 Bill Upham <[email protected]>
ha scritto:
> Thank you Aaron for the information, it was an improvement, I'm attaching
> one of the png files that I read. (I have 200)
> It is interesting how It is still not reading every file 100% correctly.
> My script counts the digits and sometimes it misses one of them or it calls
> a 1 a 15.
> Maybe I'm expecting perfection from computer vision and that's just not
> the case!
> Thanks again
>
> Bill Upham
>
> On Sat, Mar 14, 2020 at 3:03 PM Aaron Stewart <[email protected]>
> wrote:
>
>>
>> roi = cv2.resize(roi, None, fx=2, fy=2)
>> _, roi = cv2.threshold(roi, 128+64, 255, cv2.THRESH_BINARY)
>> roi = cv2.GaussianBlur(roi, (3,3), 0)
>> text_detected = image_to_string(roi, config="--psm 10 --oem 3
>> tessedit_char_whitelist=0123456789", )
>>
>> --
>> You received this message because you are subscribed to the Google Groups
>> "tesseract-ocr" group.
>> To unsubscribe from this group and stop receiving emails from it, send an
>> email to [email protected].
>> To view this discussion on the web visit
>> https://groups.google.com/d/msgid/tesseract-ocr/2ca084e4-aae6-423e-b359-a472e00579e6%40googlegroups.com
>> <https://groups.google.com/d/msgid/tesseract-ocr/2ca084e4-aae6-423e-b359-a472e00579e6%40googlegroups.com?utm_medium=email&utm_source=footer>
>> .
>>
> --
> You received this message because you are subscribed to the Google Groups
> "tesseract-ocr" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/tesseract-ocr/CAF5KrqB3HiPT3cKP6QLUR4u%2Bu3W1B7VbdUfKLfBYs-HnumwZWg%40mail.gmail.com
> <https://groups.google.com/d/msgid/tesseract-ocr/CAF5KrqB3HiPT3cKP6QLUR4u%2Bu3W1B7VbdUfKLfBYs-HnumwZWg%40mail.gmail.com?utm_medium=email&utm_source=footer>
> .
>
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLwx%2BRww%3DaxGXp%2BqfPd8rstaNaxTAb1CA2TtM-9spqV%2BiA%40mail.gmail.com.
'''
Created on May 16, 2018
@author: trz
'''
import logging
import sys
import time
import tesserocr
from collections import deque
from multiprocessing import Lock
from tesserocr import PyTessBaseAPI, RIL, iterate_level
from operator import itemgetter
import cv2
import numpy as np
# Tesseract language model handed to PyTessBaseAPI in recognize_text().
# The commented-out alternatives are other model names previously tried.
# DEFAULT_LANG="ita15kf"
# DEFAULT_LANG="elett_16000"
DEFAULT_LANG = "eng"
# Colours cycled through (index i % 3) for the per-symbol bounding boxes.
# NOTE(review): the tuples are drawn onto an image produced with
# cv2.COLOR_GRAY2BGR, so OpenCV reads them as (B, G, R): (255, 0, 0) shows as
# blue and `yellow` below shows as cyan. Confirm whether RGB was intended.
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
# Marker drawn along the top edge for even-indexed symbols.
yellow = (255, 255, 0)
# Marker drawn along the bottom edge for odd-indexed symbols.
magenta = (255, 0, 255)
# Not referenced in the visible part of this file.
gray = (100, 100, 100)
# Not referenced in the visible part of this file.
DEBUG = False
def recognize_text(raw_img, gap_threshold=12):
    """Run Tesseract on a grayscale image and visualise the symbol spacing.

    Every recognised symbol gets a bounding box drawn on a colour copy of the
    image, plus a short marker on the top edge (even index) or bottom edge
    (odd index) so neighbouring boxes can be told apart.  The horizontal gap
    between each symbol and the previous one is printed and drawn as a filled
    bar at mid-height of the current symbol: light gray when the gap is below
    ``gap_threshold``, green otherwise.  The annotated image is then shown in
    an OpenCV window until a key is pressed.

    Args:
        raw_img: single-channel image as a numpy array (as returned by
            ``cv2.imread(path, 0)``).  It is handed to Tesseract with one
            byte per pixel, so it is assumed to be 8-bit -- TODO confirm for
            other callers.
        gap_threshold: gap width in pixels at or above which the space between
            two symbols is highlighted as a separator.  This is the "magic
            threshold" that has to be tuned to the preprocessing in use.

    Returns:
        A list of ``(symbol, gap)`` tuples in iteration order, where ``gap``
        is the distance in pixels from the right edge of the previous symbol
        to the left edge of this one (``0`` for the first symbol).

    Raises:
        ValueError: if ``raw_img`` is ``None`` (e.g. ``cv2.imread`` failed).
    """
    if raw_img is None:
        raise ValueError("raw_img is None (did cv2.imread fail to load the file?)")

    lang = DEFAULT_LANG
    psm_mode = tesserocr.PSM.SINGLE_BLOCK
    draw_img = cv2.cvtColor(raw_img, cv2.COLOR_GRAY2BGR)
    img_height, img_width = raw_img.shape[0], raw_img.shape[1]
    gaps = []

    # The context manager releases the native Tesseract handle even when
    # recognition raises.  The constructor already initialises the engine for
    # `lang`, so no separate Init() call is needed.
    with PyTessBaseAPI(lang=lang) as api:
        logging.info("Tesseract version %s", api.Version())
        api.SetPageSegMode(psm_mode)
        # 1 byte per pixel, one row of `img_width` bytes per line.
        api.SetImageBytes(raw_img.tobytes(), img_width, img_height, 1, img_width)
        api.Recognize()
        ri = api.GetIterator()

        prev_symbol_end = None  # right edge (x) of the previous symbol box
        prev_symbol = None
        for i, r in enumerate(iterate_level(ri, RIL.SYMBOL)):
            symbolBounds = r.BoundingBox(RIL.SYMBOL)
            conf = r.Confidence(RIL.SYMBOL)
            if symbolBounds is None:
                print("symbolBounds is None", conf)
                continue
            left, top, right, bottom = symbolBounds[:4]
            symbol = r.GetUTF8Text(RIL.SYMBOL)

            # Bounding box, colour cycling so adjacent boxes differ.
            cv2.rectangle(draw_img, (left, top), (right, bottom), colors[i % 3])
            # Alternate top/bottom edge markers showing each symbol's x-extent.
            if i % 2 == 0:
                cv2.rectangle(draw_img, (left, 0), (right, 1), yellow, 2)
            else:
                cv2.rectangle(draw_img, (left, draw_img.shape[0] - 2),
                              (right, draw_img.shape[0] - 2), magenta, 2)

            # Space between this symbol and the previous one.
            # NOTE(review): gaps are not reset at the start of a new text
            # line, so the first symbol of a line may get a negative gap.
            curr_symbol_start = left
            if prev_symbol_end is None:
                # First symbol: there is no gap to measure or draw.
                space_between_symbols = 0
            else:
                space_between_symbols = curr_symbol_start - prev_symbol_end
                print("From", prev_symbol, "to", symbol, space_between_symbols)
                half_h = (bottom - top) // 2 + top
                if space_between_symbols < gap_threshold:
                    color = (200, 200, 200)
                else:
                    color = (0, 200, 0)
                # Thickness -1 fills the bar spanning the gap.
                cv2.rectangle(draw_img, (prev_symbol_end, half_h - 2),
                              (curr_symbol_start, half_h + 2), color, -1)

            gaps.append((symbol, space_between_symbols))
            prev_symbol_end = right
            prev_symbol = symbol

    #cv2.imwrite("boxes.jpg", draw_img)
    cv2.imshow("ocr boxes", draw_img)
    cv2.waitKey(0)
    return gaps
# Script entry point: read the image named on the command line as grayscale
# and show its annotated symbol boxes.
if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    # Fail with a usage message instead of an IndexError when no path is given.
    if len(sys.argv) < 2:
        sys.exit("usage: %s IMAGE_FILE" % sys.argv[0])
    filename = sys.argv[1]
    # Flag 0 == cv2.IMREAD_GRAYSCALE: load as a single-channel image.
    img = cv2.imread(filename, 0)
    # cv2.imread signals a missing or unreadable file by returning None
    # rather than raising, so check explicitly.
    if img is None:
        sys.exit("could not read image file: %s" % filename)
    res = recognize_text(img)