 src/docrecord.py |   54 +++++++++++++++++++++++++++---------------------------
 src/docstream.py |   23 ++++++++++++++---------
 2 files changed, 41 insertions(+), 36 deletions(-)
New commits: commit 953c2e65c059c8542b977d7a42fccf26b0397e66 Author: Miklos Vajna <vmik...@suse.cz> Date: Thu May 23 12:13:17 2013 +0200 WordDocumentStream::__cpToOffset: use binary search before: $ time ./doc-dump.py fdo39958-1.doc > out.xml real 11m51.771s user 11m49.455s sys 0m0.099s after: $ time ./doc-dump.py fdo39958-1.doc > out.xml real 0m42.294s user 0m42.023s sys 0m0.058s diff --git a/src/docstream.py b/src/docstream.py index d147b82..cc0766b 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -12,6 +12,7 @@ from docdirstream import DOCDirStream import docrecord import globals import sys +import bisect class DOCFile: """Represents the whole word file - feed will all bytes.""" @@ -932,9 +933,7 @@ class WordDocumentStream(DOCDirStream): def __cpToOffset(self, cp): """Implements 2.4.1 Retrieving Text.""" plcPcd = self.clx.pcdt.plcPcd - for i in range(len(plcPcd.aCp)): - if plcPcd.aCp[i] <= cp: - index = i + index = bisect.bisect_right(plcPcd.aCp, cp) - 1 aPcd = plcPcd.aPcd[index] fcCompressed = aPcd.fc if fcCompressed.fCompressed == 1: commit 3d77a214bf257e79f5fcbe09079c139d67100069 Author: Miklos Vajna <vmik...@suse.cz> Date: Wed May 22 21:44:50 2013 +0200 doc: handle fWhichTblStm != 1 fdo37057-1.doc is a reproducer for this. 
diff --git a/src/docrecord.py b/src/docrecord.py index 570269d..a14f81e 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -99,7 +99,7 @@ class FBKF(DOCDirStream): class PlcfBkf(DOCDirStream, PLC): """A PLCFBKF is a PLC whose data elements are FBKF structures.""" def __init__(self, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, size, 4) # 4 is defined by 2.8.10 self.pos = offset self.size = size @@ -152,7 +152,7 @@ class Fld(DOCDirStream): class PlcFld(DOCDirStream, PLC): """The Plcfld structure specifies the location of fields in the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, mainStream.lcbPlcfFldMom, 2) # 2 is defined by 2.8.25 self.pos = mainStream.fcPlcfFldMom self.size = mainStream.lcbPlcfFldMom @@ -184,7 +184,7 @@ class PlcFld(DOCDirStream, PLC): class PlcfBkl(DOCDirStream, PLC): """The Plcfbkl structure is a PLC that contains only CPs and no additional data.""" def __init__(self, mainStream, offset, size, start): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, size, 0) # 0 is defined by 2.8.12 self.pos = offset self.size = size @@ -273,7 +273,7 @@ class Sed(DOCDirStream): class PlcfSed(DOCDirStream, PLC): """The PlcfSed structure is a PLC structure where the data elements are Sed structures.""" def __init__(self, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + 
DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, size, Sed.size) self.pos = offset self.size = size @@ -299,7 +299,7 @@ class PlcfSed(DOCDirStream, PLC): class Tcg(DOCDirStream): """The Tcg structure specifies command-related customizations.""" def __init__(self, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = offset self.size = size @@ -339,7 +339,7 @@ class Sty(DOCDirStream): class Selsf(DOCDirStream): """The Selsf structure specifies the last selection that was made to the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = mainStream.fcWss self.size = mainStream.lcbWss self.mainStream = mainStream @@ -1011,7 +1011,7 @@ class PnFkpPapx(DOCDirStream): class PlcBteChpx(DOCDirStream, PLC): """The PlcBteChpx structure is a PLC that maps the offsets of text in the WordDocument stream to the character properties of that text.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream) PLC.__init__(self, mainStream.lcbPlcfBteChpx, 4) self.pos = mainStream.fcPlcfBteChpx self.size = mainStream.lcbPlcfBteChpx @@ -1036,7 +1036,7 @@ class PlcfHdd(DOCDirStream, PLC): """The Plcfhdd structure is a PLC that contains only CPs and no additional data. 
It specifies where header document stories begin and end.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream) PLC.__init__(self, mainStream.lcbPlcfHdd, 0) self.pos = mainStream.fcPlcfHdd self.size = mainStream.lcbPlcfHdd @@ -1081,7 +1081,7 @@ class PlcfHdd(DOCDirStream, PLC): class PlcfandTxt(DOCDirStream, PLC): """The PlcfandTxt structure is a PLC that contains only CPs and no additional data.""" def __init__(self, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream) PLC.__init__(self, size, 0) self.pos = offset self.size = size @@ -1102,7 +1102,7 @@ class PlcfandTxt(DOCDirStream, PLC): class PlcfandRef(DOCDirStream, PLC): """The PlcfandRef structure is a PLC whose data elements are ATRDPre10 structures.""" def __init__(self, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream) PLC.__init__(self, size, 30) self.pos = offset self.size = size @@ -1881,7 +1881,7 @@ class Dop2007(DOCDirStream): class RC4EncryptionHeader(DOCDirStream): """The encryption header structure used for RC4 encryption.""" def __init__(self, fib, pos, size): - DOCDirStream.__init__(self, fib.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, fib.getTableStream().bytes) self.fib = fib self.pos = pos self.size = size @@ -1900,7 +1900,7 @@ class RC4EncryptionHeader(DOCDirStream): class Dop(DOCDirStream): """The Dop structure contains the document and compatibility settings for the document.""" def __init__(self, fib): - DOCDirStream.__init__(self, 
fib.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, fib.getTableStream().bytes) self.pos = fib.fcDop self.size = fib.lcbDop self.fib = fib @@ -2006,7 +2006,7 @@ class SttbfFfn(DOCDirStream): class GrpXstAtnOwners(DOCDirStream): """This array contains the names of authors of comments in the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = mainStream.fcGrpXstAtnOwners self.size = mainStream.lcbGrpXstAtnOwners self.mainStream = mainStream @@ -2023,7 +2023,7 @@ class GrpXstAtnOwners(DOCDirStream): class SttbfAssoc(DOCDirStream): """The SttbfAssoc structure is an STTB that contains strings which are associated with this document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = mainStream.fcSttbfAssoc self.size = mainStream.lcbSttbfAssoc self.mainStream = mainStream @@ -2069,7 +2069,7 @@ class SttbfAssoc(DOCDirStream): class SttbfRMark(DOCDirStream): """The SttbfRMark structure is an STTB structure where the strings specify the names of the authors of the revision marks, comments, and e-mail messages in the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = mainStream.fcSttbfRMark self.size = mainStream.lcbSttbfRMark self.mainStream = mainStream @@ -2105,7 +2105,7 @@ class OfficeArtWordDrawing(DOCDirStream): class OfficeArtContent(DOCDirStream): """The OfficeArtContent structure specifies information about a drawing in the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, 
mainStream.getTableStream().bytes) self.pos = mainStream.fcDggInfo self.size = mainStream.lcbDggInfo self.mainStream = mainStream @@ -2140,7 +2140,7 @@ class ATNBE(DOCDirStream): class SttbfAtnBkmk(DOCDirStream): """The SttbfAtnBkmk structure is an STTB whose strings are all of zero length.""" def __init__(self, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = offset self.size = size @@ -2650,7 +2650,7 @@ class SPLS(DOCDirStream): class PlcfSpl(DOCDirStream, PLC): """The Plcfspl structure is a Plc structure whose data elements are SpellingSpls structures.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, mainStream.lcbPlcfSpl, 2) # 2 is defined by 2.8.28 self.pos = mainStream.fcPlcfSpl self.size = mainStream.lcbPlcfSpl @@ -2732,7 +2732,7 @@ class FTXBXS(DOCDirStream): class PlcftxbxTxt(DOCDirStream, PLC): """Specifies which ranges of text are contained in which textboxes.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, mainStream.lcbPlcftxbxTxt, FTXBXS.size) self.pos = mainStream.fcPlcftxbxTxt self.size = mainStream.lcbPlcftxbxTxt @@ -2781,7 +2781,7 @@ class Tbkd(DOCDirStream): class PlcftxbxBkd(DOCDirStream, PLC): """Specifies which ranges of text go inside which textboxes.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) 
PLC.__init__(self, mainStream.lcbPlcfTxbxBkd, 6) self.pos = mainStream.fcPlcfTxbxBkd self.size = mainStream.lcbPlcfTxbxBkd @@ -2807,7 +2807,7 @@ class PlcfSpa(DOCDirStream, PLC): """The PlcfSpa structure is a PLC structure in which the data elements are SPA structures.""" def __init__(self, mainStream, pos, size): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, size, 26) # 2.8.37 self.pos = pos self.size = size @@ -2822,7 +2822,7 @@ class PlcfSpa(DOCDirStream, PLC): class PlcfGram(DOCDirStream, PLC): """The PlcfGram structure is a Plc structure whose data elements are GrammarSpls structures.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) PLC.__init__(self, mainStream.lcbPlcfGram, 2) # 2 is defined by 2.8.21 self.pos = mainStream.fcPlcfGram self.size = mainStream.lcbPlcfGram @@ -2936,7 +2936,7 @@ class LVL(DOCDirStream): class PlfLst(DOCDirStream): """The PlfLst structure contains the list formatting information for the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = mainStream) self.pos = mainStream.fcPlfLst self.size = mainStream.lcbPlfLst @@ -2992,7 +2992,7 @@ class LFOData(DOCDirStream): class PlfLfo(DOCDirStream): """The PlfLfo structure contains the list format override data for the document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream = mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream = 
mainStream) self.pos = mainStream.fcPlfLfo self.size = mainStream.lcbPlfLfo @@ -3014,7 +3014,7 @@ class PlfLfo(DOCDirStream): class SttbListNames(DOCDirStream): """The SttbListNames structure is an STTB structure whose strings are the names used by the LISTNUM field.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream) self.pos = mainStream.fcSttbListNames self.size = mainStream.lcbSttbListNames @@ -3035,7 +3035,7 @@ class SttbListNames(DOCDirStream): class SttbSavedBy(DOCDirStream): """The SttbSavedBy structure is an STTB structure that specifies the save history of this document.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes, mainStream=mainStream) self.pos = mainStream.fcSttbSavedBy self.size = mainStream.lcbSttbSavedBy @@ -3056,7 +3056,7 @@ class SttbSavedBy(DOCDirStream): class SttbfBkmk(DOCDirStream): """The SttbfBkmk structure is an STTB structure whose strings specify the names of bookmarks.""" def __init__(self, mainStream): - DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes) + DOCDirStream.__init__(self, mainStream.getTableStream().bytes) self.pos = mainStream.fcSttbfBkmk self.size = mainStream.lcbSttbfBkmk self.mainStream = mainStream diff --git a/src/docstream.py b/src/docstream.py index 37db294..d147b82 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -50,7 +50,7 @@ class DOCFile: def getStreamFromBytes(self, name, bytes): if name == "WordDocument": return WordDocumentStream(bytes, self.params, doc=self) - if name == "1Table": + if name in ("0Table", "1Table"): return TableStream(bytes, self.params, name, doc=self) else: return DOCDirStream(bytes, self.params, name, 
doc=self) @@ -181,6 +181,12 @@ class WordDocumentStream(DOCDirStream): self.printAndSet("lidThemeCS", self.readuInt16()) print '</%s>' % name + def getTableStream(self): + if self.fWhichTblStm: + return self.doc.getDirectoryStreamByName("1Table") + else: + return self.doc.getDirectoryStreamByName("0Table") + def dumpFibBase(self, name): ret = True print '<%s type="FibBase" size="32 bytes">' % name @@ -217,7 +223,7 @@ class WordDocumentStream(DOCDirStream): if self.fEncrypted == 1 and self.fObfuscated == 0: self.printAndSet("lKey", self.readuInt32(), end = False) print '<EncryptionVersionInfo>' - tableStream = self.doc.getDirectoryStreamByName("1Table") + tableStream = self.getTableStream() self.printAndSet("vMajor", tableStream.readuInt16()) self.printAndSet("vMinor", tableStream.readuInt16()) print '</EncryptionVersionInfo>' @@ -531,7 +537,7 @@ class WordDocumentStream(DOCDirStream): def handleLcbClx(self, silent = False): offset = self.fcClx size = self.lcbClx - self.clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) + self.clx = docrecord.Clx(self.getTableStream().bytes, self, offset, size) if not silent: self.clx.dump() @@ -546,19 +552,19 @@ class WordDocumentStream(DOCDirStream): def handleLcbPlcfBtePapx(self): offset = self.fcPlcfBtePapx size = self.lcbPlcfBtePapx - plcBtePapx = docrecord.PlcBtePapx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) + plcBtePapx = docrecord.PlcBtePapx(self.getTableStream().bytes, self, offset, size) plcBtePapx.dump() def handleLcbSttbfFfn(self): offset = self.fcSttbfFfn size = self.lcbSttbfFfn - sttbfFfn = docrecord.SttbfFfn(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) + sttbfFfn = docrecord.SttbfFfn(self.getTableStream().bytes, self, offset, size) sttbfFfn.dump() def handleLcbStshf(self): offset = self.fcStshf size = self.lcbStshf - stsh = docrecord.STSH(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) + stsh = 
docrecord.STSH(self.getTableStream().bytes, self, offset, size) stsh.dump() def handleLcbPlcfandTxt(self): _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits