Hi,

I have been trying out Mayan as a document management server and have
been very impressed so far.  However, we ran into some problems with PDF
parsing, so I modified a few routines to use the Xpdf tools, which seem
to be much faster than the GraphicsMagick backend.

This probably breaks for non-PDF files, but it would be a nice
addition for PDF parsing:

apps/converter/backends/graphicsmagick/base.py


 def get_page_count(self, input_filepath):
        """
        Return the page count of ``input_filepath`` as reported by the
        Xpdf ``pdfinfo`` command line tool.

        ``pdfinfo`` is run on the file and its standard output is scanned
        for the ``Pages: <n>`` line.

        Raises ``UnknownFileFormat`` when no such line is found in the
        output (for example when ``pdfinfo`` could not read the file).
        """
        # Imported locally so this method does not depend on a module
        # level logger being defined in the backend module.
        import logging
        log = logging.getLogger(__name__)

        command = ['pdfinfo', unicode(input_filepath)]
        proc = subprocess.Popen(
            command, close_fds=True,
            stderr=subprocess.PIPE, stdout=subprocess.PIPE
        )
        # communicate() drains both pipes while waiting for the child.
        # Calling wait() first can deadlock once the child fills a pipe
        # buffer that nobody is reading.
        stdout_data, stderr_data = proc.communicate()

        if proc.returncode != 0:
            # Report the failure but still try to parse whatever was
            # written to stdout; a missing "Pages:" line raises below.
            log.error(
                'pdfinfo exited with return code %d: %s',
                proc.returncode, stderr_data.strip()
            )

        pages_pattern = re.compile(r'Pages:\s+(\d+)', re.I)
        for line in stdout_data.splitlines():
            match = pages_pattern.match(line)
            if match:
                # Convert right away so callers always get an int and no
                # string/int comparison is ever needed.
                return int(match.group(1))

        raise UnknownFileFormat


PDF text parsing: /apps/ocr/parsers/__init__.py

def pdf_parser(document_page, descriptor=None):
    """
    Extract the text of one PDF page with the Xpdf ``pdftotext`` tool and
    store it on ``document_page``.

    ``document_page`` supplies the page number and, through
    ``document_version.file.path``, the PDF to read.  On success its
    ``content`` and ``page_label`` are updated and it is saved.
    ``descriptor`` is accepted for interface compatibility and is not
    used here.

    Raises ``ParserError`` when ``pdftotext`` exits with a non-zero
    return code, or when the extracted text contains more "other"
    characters than alphabetic ones (the original author's heuristic for
    a scanned PDF without a usable text layer).
    """
    pagenum = str(document_page.page_number)
    logger.debug('parsing PDF page %s', pagenum)

    command = [
        'pdftotext',
        # Restrict extraction to the single requested page.
        '-f', pagenum,
        '-l', pagenum,
        unicode(document_page.document_version.file.path),
        # "-" as the output file sends the text to stdout.
        '-',
    ]

    proc = subprocess.Popen(
        command, close_fds=True,
        stderr=subprocess.PIPE, stdout=subprocess.PIPE
    )
    # communicate() reads both pipes until the child exits.  Waiting
    # first and reading afterwards can hang forever when a page holds
    # more text than fits in the pipe buffer.
    output, error_output = proc.communicate()
    if proc.returncode != 0:
        logger.error(
            'pdftotext exited with return code %d: %s',
            proc.returncode, error_output.strip()
        )
        raise ParserError

    # Count characters one by one instead of len(filter(...)), which only
    # works where filter() returns a sequence.
    # NOTE(review): notalphaorspace is defined elsewhere in this module;
    # presumably true for characters that are neither letters nor
    # whitespace -- confirm.
    numalpha = sum(1 for char in output if char.isalpha())
    numother = sum(1 for char in output if notalphaorspace(char))

    logger.debug("Numalpha = %d  Numother = %d", numalpha, numother)

    if numother > numalpha:
        logger.debug("parser error... probably scanned pdf.")
        raise ParserError

    document_page.content = output
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()


Reply via email to