Thanks, this was really helpful. I used catdoc, catppt, xls2csv, pdftotext from xpdf, and ps2txt from ghostview!
Björn Lindqvist wrote: > On 4 Jul 2006 08:38:47 -0700, Gaurav Agarwal > <[EMAIL PROTECTED]> wrote: > > Thanks Steven. Actually, I wanted to do text processing for my office > > where I can view all files in the system and use the first three to > > give a summary of the document, instead of having somebody actually > > entering the summary. Seems there is no one code that can act as > > convertor across formats; I'll have to check out convertors for > > individual formats. > > I have some old code that does just that. It uses pdftotext, catdoc > and links to convert .doc, .pdf and .html to text. > > ################################################################## > import mimetypes > from subprocess import call, Popen, PIPE > import sys > > class ConversionError(Exception): > pass > > class UnknownMimeType(ConversionError): > pass > > class NotAMimeType(ConversionError): > pass > > class ParseError(ConversionError): > pass > > def has_program(progname): > return call(["which", progname], stdout = PIPE) == 0 > > def check_requirements(): > missing = [] > for prog in "catdoc", "pdftotext", "links": > if not has_program(prog): > missing.append(prog) > if missing: > print "You need to have the programs:", " ".join(missing) > return False > return True > > if not check_requirements(): > print "Needed external programs not found, quitting" > sys.exit(1) > > def get_catdoc_args(infile): > return ["catdoc", "-s", "8859-1", infile] > > def get_pdftotext_args(infile): > return ["pdftotext", infile, "-"] > > def get_links_args(infile): > return ["links", infile, "-dump"] > > def totext(document): > filetype_to_args_map = {"application/msword" : get_catdoc_args, > "application/pdf" : get_pdftotext_args, > "text/html" : get_links_args} > > ftype, ign = mimetypes.guess_type(document) > if not ftype: > raise NotAMimeType, "Couldn't detect mimetype for %s" % document > try: > argfunc = filetype_to_args_map[ftype] > except KeyError: > s = "Don't know how to handle %s documents" % 
ftype > raise UnknownMimeType, s > > p = Popen(argfunc(document), stdout = PIPE, stderr = PIPE) > text = p.stdout.read() > if p.wait(): > # Force a better exception to be thrown if the file doesn't exist. > open(document) > raise ParseError, "Failed to parse %s" % document > return text > > if __name__ == "__main__": > print totext("testpdf.pdf") > > > > -- > mvh Björn -- http://mail.python.org/mailman/listinfo/python-list