Excellent! I will find a way to get this running alongside the existing
code. Thanks for your contribution!
On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
>
> Hi,
>
> Trying out Mayan for a document management server and have been very
> impressed so far. However, we ran into some problems with PDF parsing
> and I modified a few routines to use Xpdf tools, which seem to be much
> faster than the graphicsmagick backend.
>
> This probably breaks for non-PDF files, but it would be a nice
> addition for PDF parsing:
>
> apps/converter/backends/graphicsmagick/base.py
>
>
> def get_page_count(self, input_filepath):
> command = []
> command.append('pdfinfo')
> command.append(unicode(input_filepath))
> proc = subprocess.Popen(command, close_fds=True,
> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
> return_code = proc.wait()
> if return_code != 0:
> print proc.stderr.readline()
>
> output = proc.stdout.read().splitlines()
>
> numpages = -1
> for line in output:
> matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)
>
> if matchObj:
> numpages = matchObj.group(1)
> break
> else:
> print "No match!!"
>
> if numpages < 0:
> raise UnknownFileFormat
> else:
> return int(numpages)
>
>
> PDF text parsing: /apps/ocr/parsers/__init__.py
>
> def pdf_parser(document_page, descriptor=None):
>
> logger.debug('parsing PDF')
> pagenum = str(document_page.page_number)
>
> logger.debug('parsing PDF page %s' % pagenum)
>
> command = []
> command.append('pdftotext')
> command.append('-f')
> command.append(pagenum)
> command.append('-l')
> command.append(pagenum)
> command.append(unicode(document_page.document_version.file.path))
> command.append('-')
>
> proc = subprocess.Popen(command, close_fds=True,
> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
> return_code = proc.wait()
> if return_code != 0:
> print proc.stderr.readline()
> raise ParserError
>
> output = proc.stdout.read()
> numalpha = len( filter(str.isalpha, output) )
> numother = len( filter(notalphaorspace, output) )
>
> logger.debug("Numalpha = %d Numother = %d" % (numalpha,
> numother))
>
> if numother > numalpha:
> logger.debug("parser error... probably scanned pdf.")
> raise ParserError
>
> document_page.content = output
> document_page.page_label = _(u'Text extracted from PDF')
> document_page.save()
>
>
>