Hi Brian,
I started integrating your patch and hit this error: "global name
'notalphaorspace' is not defined". It is raised by this line:
"numother = len( filter(notalphaorspace, output) )".
What is notalphaorspace meant to do?
Thanks.
On Sunday, May 13, 2012 1:57:30 PM UTC-4, Roberto Rosario wrote:
>
> Excellent! I will find a way to get this running alongside the existing
> code. Thanks for your contribution!
>
> On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
>>
>> Hi,
>>
>> Trying out Mayan for a document management server and have been very
>> impressed so far. However, we ran into some problems with pdf parsing
>> and I modified a few routines to use the Xpdf tools, which seem to be much
>> faster than the graphicsmagick backend.
>>
>> This probably breaks for non-PDF files, but it would be a nice
>> addition for PDF parsing:
>>
>> apps/converter/backends/graphicsmagick/base.py
>>
>>
>> def get_page_count(self, input_filepath):
>> command = []
>> command.append('pdfinfo')
>> command.append(unicode(input_filepath))
>> proc = subprocess.Popen(command, close_fds=True,
>> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
>> return_code = proc.wait()
>> if return_code != 0:
>> print proc.stderr.readline()
>>
>> output = proc.stdout.read().splitlines()
>>
>> numpages = -1
>> for line in output:
>> matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I)
>>
>> if matchObj:
>> numpages = matchObj.group(1)
>> break
>> else:
>> print "No match!!"
>>
>> if numpages < 0:
>> raise UnknownFileFormat
>> else:
>> return int(numpages)
>>
>>
>> PDF text parsing: /apps/ocr/parsers/__init__.py
>>
>> def pdf_parser(document_page, descriptor=None):
>>
>> logger.debug('parsing PDF')
>> pagenum = str(document_page.page_number)
>>
>> logger.debug('parsing PDF page %s' % pagenum)
>>
>> command = []
>> command.append('pdftotext')
>> command.append('-f')
>> command.append(pagenum)
>> command.append('-l')
>> command.append(pagenum)
>> command.append(unicode(document_page.document_version.file.path))
>> command.append('-')
>>
>> proc = subprocess.Popen(command, close_fds=True,
>> stderr=subprocess.PIPE, stdout=subprocess.PIPE)
>> return_code = proc.wait()
>> if return_code != 0:
>> print proc.stderr.readline()
>> raise ParserError
>>
>> output = proc.stdout.read()
>> numalpha = len( filter(str.isalpha, output) )
>> numother = len( filter(notalphaorspace, output) )
>>
>> logger.debug("Numalpha = %d Numother = %d" % (numalpha,
>> numother))
>>
>> if numother > numalpha:
>> logger.debug("parser error... probably scanned pdf.")
>> raise ParserError
>>
>> document_page.content = output
>> document_page.page_label = _(u'Text extracted from PDF')
>> document_page.save()
>>
>>
>>