Hi Brian,

I started integrating your patch and hit this error: "global name 
'notalphaorspace' is not defined", raised by the line "numother = len( 
filter(notalphaorspace, output) )".  What is notalphaorspace meant to do?

Thanks.


On Sunday, May 13, 2012 1:57:30 PM UTC-4, Roberto Rosario wrote:
>
> Excellent!  I will find a way to get this running alongside the existing 
> code.  Thanks for your contribution!
>
> On Sunday, May 13, 2012 3:29:31 AM UTC-4, Brian E wrote:
>>
>> Hi, 
>>
>> I have been trying out Mayan for a document management server and have 
>> been very impressed so far.  However, we ran into some problems with PDF 
>> parsing, and I modified a few routines to use the Xpdf tools, which seem 
>> to be much faster than the GraphicsMagick backend. 
>>
>> This probably breaks for non-PDF files, but it would be a nice 
>> addition for PDF parsing: 
>>
>> apps/converter/backends/graphicsmagick/base.py 
>>
>>
>>  def get_page_count(self, input_filepath): 
>>         command = [] 
>>         command.append('pdfinfo') 
>>         command.append(unicode(input_filepath)) 
>>         proc = subprocess.Popen(command, close_fds=True, 
>> stderr=subprocess.PIPE, stdout=subprocess.PIPE) 
>>         return_code = proc.wait() 
>>         if return_code != 0: 
>>             print proc.stderr.readline() 
>>
>>         output = proc.stdout.read().splitlines() 
>>
>>         numpages = -1 
>>         for line in output: 
>>             matchObj = re.match( r'Pages:\s+(\d+)', line, re.M|re.I) 
>>
>>             if matchObj: 
>>                 numpages = matchObj.group(1) 
>>                 break 
>>             else: 
>>                 print "No match!!" 
>>
>>         if numpages < 0: 
>>             raise UnknownFileFormat 
>>         else: 
>>             return int(numpages) 
>>
>>
>> PDF text parsing: /apps/ocr/parsers/__init__.py 
>>
>> def pdf_parser(document_page, descriptor=None): 
>>
>>     logger.debug('parsing PDF') 
>>     pagenum = str(document_page.page_number) 
>>
>>     logger.debug('parsing PDF page %s' % pagenum) 
>>
>>     command = [] 
>>     command.append('pdftotext') 
>>     command.append('-f') 
>>     command.append(pagenum) 
>>     command.append('-l') 
>>     command.append(pagenum) 
>>     command.append(unicode(document_page.document_version.file.path)) 
>>     command.append('-') 
>>
>>     proc = subprocess.Popen(command, close_fds=True, 
>> stderr=subprocess.PIPE, stdout=subprocess.PIPE) 
>>     return_code = proc.wait() 
>>     if return_code != 0: 
>>         print proc.stderr.readline() 
>>         raise ParserError 
>>
>>     output = proc.stdout.read() 
>>     numalpha = len( filter(str.isalpha, output) ) 
>>     numother = len( filter(notalphaorspace, output) ) 
>>
>>     logger.debug("Numalpha = %d  Numother = %d" % (numalpha, 
>> numother)) 
>>
>>     if numother > numalpha: 
>>         logger.debug("parser error... probably scanned pdf.") 
>>         raise ParserError 
>>
>>     document_page.content = output 
>>     document_page.page_label = _(u'Text extracted from PDF') 
>>     document_page.save() 
>>
>>
>>

Reply via email to