doc-dump.py | 4 +-- src/docstream.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 4 deletions(-)
New commits: commit ea28062eb49560c8917d19d6e20152d851858e6e Author: Miklos Vajna <[email protected]> Date: Fri May 17 16:57:51 2013 +0200 give usable error message on ole-based ww6 input diff --git a/src/docstream.py b/src/docstream.py index a94d0d3..5c024fe 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -187,6 +187,9 @@ class WordDocumentStream(DOCDirStream): self.printAndSet("wIdent", self.readuInt16()) self.printAndSet("nFib", self.readuInt16()) + if self.nFib >= 0x65 and self.nFib <= 0x69: + print '<todo what="handle nFib 0x65..0x69: ww6 syntax"/>' + ret = False self.printAndSet("unused", self.readuInt16()) self.printAndSet("lid", self.readuInt16()) self.printAndSet("pnNext", self.readuInt16()) commit 0af1cd44f27a2f93ebcaa9265ad3e4f92107d2a3 Author: Miklos Vajna <[email protected]> Date: Fri May 17 15:29:16 2013 +0200 doc: use gsf to read the ole streams when it's available The GSF parser can deal with e.g. fdo33590-1.doc diff --git a/doc-dump.py b/doc-dump.py index 17aec51..b99d6fe 100755 --- a/doc-dump.py +++ b/doc-dump.py @@ -19,10 +19,10 @@ class DOCDumper: def dump(self): file = open(self.filepath, 'rb') - strm = docstream.DOCFile(file.read(), self.params) + strm = docstream.createDOCFile(file.read(), self.params) file.close() dirnames = strm.getDirectoryNames() - print '<?xml version="1.0"?>\n<streams>' + print '<?xml version="1.0"?>\n<streams ole-type="%s">' % strm.getName() for dirname in dirnames: if len(dirname) == 0 or dirname in ['Root Entry']: continue diff --git a/src/docstream.py b/src/docstream.py index e9a674f..a94d0d3 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -6,6 +6,7 @@ # import ole +import ctypes import struct from docdirstream import DOCDirStream import docrecord @@ -54,6 +55,58 @@ class DOCFile: else: return DOCDirStream(bytes, self.params, name, doc=self) + def getName(self): + return "native" + +class GsfDOCFile(DOCFile): + """Same as DOCFile, but uses gsf to read the OLE streams.""" + def __init__ (self, chars, params, gsf): + self.gsf = gsf + DOCFile.__init__(self, chars, params) + + def initWW8(self): + self.streams = {} + self.gsf.gsf_init() + gsfInput = self.gsf.gsf_input_memory_new(self.chars, len(self.chars), False) + gsfInfile = self.gsf.gsf_infile_msole_new(gsfInput) + for i in range(self.gsf.gsf_infile_num_children(gsfInfile)): + child = self.gsf.gsf_infile_child_by_index(gsfInfile, i) + childName = ctypes.string_at(self.gsf.gsf_infile_name_by_index(gsfInfile,i)) + childSize = self.gsf.gsf_input_size(child) + childData = "" + while True: + bufSize = 1024 + pos = self.gsf.gsf_input_tell(child) + if pos == childSize: + break + elif pos + bufSize > childSize: + bufSize = childSize - pos + childData += ctypes.string_at(self.gsf.gsf_input_read(child, bufSize, None), bufSize) + self.streams[childName] = childData + self.gsf.gsf_shutdown() + + def getDirectoryNames(self): + return self.streams.keys() + + def getDirectoryStreamByName(self, name): + return self.getStreamFromBytes(name, self.streams[name]) + + def getName(self): + return "gsf" + +def createDOCFile(chars, params): + hasGsf = True + try: + gsf = ctypes.cdll.LoadLibrary('libgsf-1.so') + gsf.gsf_input_read.restype = ctypes.c_void_p + except: + hasGsf = False + + if hasGsf: + return GsfDOCFile(chars, params, gsf) + else: + return DOCFile(chars, params) + class TableStream(DOCDirStream): def __init__(self, bytes, params, name, doc): DOCDirStream.__init__(self, bytes, params, name, doc = doc) commit a2522a47ae8ea503130e8e662d630a47cbdb5d95 Author: Miklos Vajna <[email protected]> Date: Fri May 17 15:08:47 2013 +0200 doc: refactor to separate code that is specific to our own ole parser diff --git a/src/docstream.py b/src/docstream.py index e888299..e9a674f 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -20,8 +20,7 @@ class DOCFile: self.params = params if ord(self.chars[0]) == 0xD0 and ord(self.chars[1]) == 0xCF and ord(self.chars[2]) == 0x11 and ord(self.chars[3]) == 0xE0: - self.header = ole.Header(self.chars, self.params) - self.pos = self.header.parse() + self.initWW8() else: print '<?xml version="1.0"?>' if ord(self.chars[0]) == 0xDB and ord(self.chars[1]) == 0xA5: @@ -30,6 +29,10 @@ class DOCFile: print '<todo what="unhandled magic"/>' sys.exit(0) + def initWW8(self): + self.header = ole.Header(self.chars, self.params) + self.pos = self.header.parse() + def __getDirectoryObj(self): obj = self.header.getDirectory() obj.parseDirEntries() @@ -41,6 +44,9 @@ class DOCFile: def getDirectoryStreamByName(self, name): obj = self.__getDirectoryObj() bytes = obj.getRawStreamByName(name) + return self.getStreamFromBytes(name, bytes) + + def getStreamFromBytes(self, name, bytes): if name == "WordDocument": return WordDocumentStream(bytes, self.params, doc=self) if name == "1Table": _______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
