[tesseract-ocr] I am doing OCR in Python for PAN card extraction, to extract details such as PAN number, name, father's name and DOB. But I do not get proper data for many PAN cards. What can I do?

Ashitha k a Wed, 23 Nov 2022 23:24:19 -0800

Please give me a solution for this problem. It is urgent.

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/4666ae9f-d9cd-42f7-b85a-62172aa1298fn%40googlegroups.com.

import io
import json
import re
import pytesseract as pt
from matplotlib import pyplot as plt
import matplotlib.image as Image
import cv2 
import cv2 as cv
import sys
import numpy as np
from PIL import Image,ImageEnhance


# Path to the Tesseract executable used by pytesseract.  Kept on a single
# line: a quoted string literal cannot span a line break.
pt.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'

# Raw string so the backslashes of the Windows path are never read as escapes.
path = r'C:\Windows\DigitalLocker\p1.jpeg'
img = cv2.imread(path)
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None instead
    # of raising; fail here with a clear message rather than inside the
    # first filter call.
    raise FileNotFoundError("Could not read image: " + path)

# Smooth the image before OCR: Gaussian blur, then median blur, then an
# edge-preserving bilateral filter.
blur = cv2.GaussianBlur(img, (5, 5), 0)
median = cv2.medianBlur(blur, 5)
blur = cv2.bilateralFilter(median, 9, 75, 75)

# Run Tesseract on the smoothed image; `text` is the raw multi-line OCR text.
text = pt.pytesseract.image_to_string(blur)

def findword(textlist, wordstring):
    """Return the lines that follow the first line containing a matching word.

    Every line of ``textlist`` is split on whitespace and each word is
    tested with ``re.search(wordstring, word)``.  The slice of ``textlist``
    after the first line with at least one matching word is returned.
    When no line matches, ``textlist`` itself is returned unchanged.
    """
    for position, line in enumerate(textlist):
        if any(re.search(wordstring, token) for token in line.split()):
            return textlist[position + 1:]
    return textlist
# Extracted fields; each one stays None when it cannot be read from the OCR
# text.
name = None
fname = None
dob = None
pan = None
nameline = []
dobline = []
panline = []
text0 = []
text1 = []
text2 = []

# Single-character OCR corrections, applied in one pass per field.
# NOTE(review): '8' and '0' are mapped to 'B'/'D' for the name but to
# 'S'/'O' for the father's name -- confirm which mapping is intended.
_NAME_FIXES = str.maketrans({'8': 'B', '0': 'D', '6': 'G', '1': 'I'})
_FNAME_FIXES = str.maketrans({'8': 'S', '0': 'O', '6': 'G', '1': 'I', '"': 'A'})
_DOB_FIXES = str.maketrans({
    'l': '/', 'L': '/', 'I': '/', 'i': '/', '|': '/',
    '"': '/1',
    ' ': '',
})
_PAN_FIXES = str.maketrans({' ': '', '"': '', ';': '', '%': 'L'})

# A line ending in one of these (partly misread) words is treated as the
# card header.
_HEADER_PATTERN = (
    '(INCOMETAXDEPARWENT|INCOME|TAX|GOW|GOVT|GOVERNMENT|OVERNMENT|VERNMENT|'
    'DEPARTMENT|EPARTMENT|PARTMENT|ARTMENT|INDIA|NDIA)$'
)
# A word ending in one of these marks the "Permanent Account Number" label.
_PAN_LABEL_PATTERN = (
    '(Pormanam|Number|umber|Account|ccount|count|Permanent|ermanent|manent|wumm)$'
)

# Keep only the non-empty OCR lines, with surrounding whitespace removed.
lines = text.split('\n')
text1 = [s for s in (lin.strip() for lin in lines) if s]

# Locate the first header line.  When none is found lineno stays 0, so the
# first line is still skipped below.
lineno = 0
for idx, wordline in enumerate(text1):
    if re.search(_HEADER_PATTERN, wordline):
        lineno = idx
        break

# The fields are read positionally from the lines after the header:
# name, father's name, date of birth.
text0 = text1[lineno+1:]
try:
    # Cleaning first names
    name = text0[0].strip().translate(_NAME_FIXES)
    name = re.sub('[^a-zA-Z] +', ' ', name)

    # Cleaning Father's name
    fname = text0[1].strip().translate(_FNAME_FIXES)
    fname = re.sub('[^a-zA-Z] +', ' ', fname)

    # Cleaning DOB: only the first 10 characters of the line are considered.
    dob = text0[2][:10].strip().translate(_DOB_FIXES)

    # Cleaning PAN Card details: the PAN is taken from the line that
    # follows the "Permanent Account Number" label.
    text0 = findword(text1, _PAN_LABEL_PATTERN)
    panline = text0[0]
    pan = panline.strip().translate(_PAN_FIXES)
except IndexError:
    # The OCR output had fewer lines than expected.  Fields parsed so far
    # are kept and the remaining ones stay None (best-effort extraction).
    pass

data = {
    'Name': name,
    'Father Name': fname,
    'Date of Birth': dob,
    'PAN': pan,
    'ID Type': "PAN",
}

print(data)

def findword(textlist, wordstring):
    """Return the part of ``textlist`` after the first line with a matching word.

    A line matches when any of its whitespace-separated words satisfies
    ``re.search(wordstring, word)``.  If no line matches, ``textlist`` is
    returned as-is.
    """
    for idx, entry in enumerate(textlist):
        for word in entry.split():
            if re.search(wordstring, word):
                # First matching line found: hand back everything after it.
                return textlist[idx + 1:]
    return textlist

# Python 2/3 compatibility shim: the `unicode` builtin only exists on
# Python 2, so fall back to `str` when it is missing.
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Write the extracted fields to info1.json as indented, key-sorted UTF-8
# JSON.  Separate names are used for the JSON text and the file handles so
# the `data` dict is not overwritten.
with io.open('info1.json', 'w', encoding='utf-8') as outfile:
    serialized = json.dumps(data, indent=4, sort_keys=True,
                            separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(serialized))

# Read the file back and print the stored fields.
with open('info1.json', encoding='utf-8') as infile:
    data_loaded = json.load(infile)
if data_loaded['ID Type'] == 'PAN':
    print("\n---------- PAN Details ----------")
    print("\nPAN Number: ", data_loaded['PAN'])
    print("\nName: ", data_loaded['Name'])
    print("\nFather's Name: ", data_loaded['Father Name'])
    print("\nDate Of Birth: ", data_loaded['Date of Birth'])

[tesseract-ocr] I am doing OCR in Python for PAN card extraction, to extract details such as PAN number, name, father's name and DOB. But I do not get proper data for many PAN cards. What can I do?

Reply via email to