Hi,
I have been trying out Mayan as a document management server and have been
very impressed so far. However, we ran into some problems with PDF parsing,
so I modified a few routines to use the Xpdf tools, which seem to be much
faster than the GraphicsMagick backend.
This probably breaks for non-PDF files, but it would be a nice
addition for PDF parsing:
Page count: apps/converter/backends/graphicsmagick/base.py
def get_page_count(self, input_filepath):
    """
    Return the number of pages of a file as reported by Xpdf's ``pdfinfo``.

    ``input_filepath`` is passed to ``pdfinfo`` as its only argument. The
    tool's standard output is scanned for a ``Pages: <n>`` line and ``<n>``
    is returned as an ``int``.

    Raises ``UnknownFileFormat`` when no such line is found in the output.
    """
    command = ['pdfinfo', unicode(input_filepath)]
    proc = subprocess.Popen(command, close_fds=True,
        stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    # communicate() drains both pipes while waiting for the child; calling
    # wait() first and read() afterwards can deadlock once the child fills
    # a pipe buffer.
    stdout_data, stderr_data = proc.communicate()

    if proc.returncode != 0 and stderr_data:
        # Best-effort diagnostic only: the output is still scanned below and
        # a missing page count is reported through UnknownFileFormat.
        print(stderr_data.splitlines()[0])

    for line in stdout_data.splitlines():
        match = re.match(r'Pages:\s+(\d+)', line, re.I)
        if match:
            # Convert right away so no string is ever compared to a number.
            return int(match.group(1))

    raise UnknownFileFormat
PDF text parsing: /apps/ocr/parsers/__init__.py
def pdf_parser(document_page, descriptor=None):
    """
    Extract the text of one PDF page with Xpdf's ``pdftotext`` and store it
    on the page.

    ``document_page`` supplies the page number and the path of the document
    version's file. On success its ``content`` and ``page_label`` are set
    and the page is saved. ``descriptor`` is accepted but not used here.

    Raises ``ParserError`` when ``pdftotext`` exits with a non-zero status,
    or when the extracted text contains more characters flagged by
    ``notalphaorspace`` than alphabetic characters.
    """
    logger.debug('parsing PDF')
    pagenum = str(document_page.page_number)
    logger.debug('parsing PDF page %s', pagenum)

    # -f/-l restrict extraction to this single page; the trailing '-' makes
    # pdftotext write the text to standard output instead of a file.
    command = [
        'pdftotext',
        '-f', pagenum,
        '-l', pagenum,
        unicode(document_page.document_version.file.path),
        '-',
    ]
    proc = subprocess.Popen(command, close_fds=True,
        stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    # communicate() drains both pipes while waiting for the child; calling
    # wait() first and read() afterwards can deadlock once the extracted
    # text fills the pipe buffer.
    output, errors = proc.communicate()
    if proc.returncode != 0:
        logger.error('pdftotext failed with code %d: %s', proc.returncode,
            errors.strip())
        raise ParserError

    numalpha = sum(1 for char in output if char.isalpha())
    numother = sum(1 for char in output if notalphaorspace(char))
    logger.debug("Numalpha = %d Numother = %d", numalpha, numother)

    # Heuristic: text dominated by non-alphabetic characters is treated as
    # garbage, most likely coming from a scanned PDF.
    # NOTE(review): completely empty output passes this check (0 > 0 is
    # false) and is saved as-is -- confirm that is the intended behaviour.
    if numother > numalpha:
        logger.debug("parser error... probably scanned pdf.")
        raise ParserError

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()