src/docrecord.py | 116 ++++++++++++++++++++++++++++++++++++------------------- src/docstream.py | 35 +++++++++++++--- 2 files changed, 106 insertions(+), 45 deletions(-)
New commits: commit 123b9721a4b19f469051696b969542b961261392 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:45:47 2013 +0100 ChpxFkp: properly transform non-compressed strings diff --git a/src/docrecord.py b/src/docrecord.py index 418d255..52dcfd4 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -30,16 +30,25 @@ class FcCompressed(DOCDirStream): self.printAndSet("r1", self.r1) print '</fcCompressed>' - def getTransformedValue(self, start, end, double = True): + def getTransformedValue(self, start, end, full = True): if self.fCompressed: offset = self.fc/2 - return globals.encodeName(self.mainStream.bytes[offset:offset+end-start]) + if full: + fro = offset + to = offset+end-start + else: + fro = start + to = end + return globals.encodeName(self.mainStream.bytes[fro:to]) else: - l = end - start - if double: - l = l * 2 - offset = self.fc - return globals.encodeName(self.mainStream.bytes[offset:offset+l].decode('utf-16'), lowOnly = True) + if full: + offset = self.fc + fro = offset + to = offset + (end - start) * 2 + else: + fro = start + to = end + return globals.encodeName(self.mainStream.bytes[fro:to].decode('utf-16'), lowOnly = True) @staticmethod def getFCTransformedValue(bytes, start, end): @@ -503,7 +512,7 @@ class ChpxFkp(DOCDirStream): start = self.getuInt32(pos = pos) end = self.getuInt32(pos = pos + 4) print '<rgfc index="%d" start="%d" end="%d">' % (i, start, end) - print '<transformed value="%s"/>' % FcCompressed.getFCTransformedValue(self.bytes, start, end) + print '<transformed value="%s"/>' % self.pnFkpChpx.mainStream.retrieveText(start, end) pos += 4 # rgbx diff --git a/src/docstream.py b/src/docstream.py index f56e27e..5197c40 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -424,9 +424,9 @@ class WordDocumentStream(DOCDirStream): def handleLcbClx(self, silent = False): offset = self.fcClx size = self.lcbClx - clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) + self.clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) if not silent: - clx.dump() + self.clx.dump() def handleLcbPlcfBteChpx(self): plcBteChpx = docrecord.PlcBteChpx(self) @@ -624,6 +624,6 @@ class WordDocumentStream(DOCDirStream): def retrieveText(self, start, end): plcPcd = self.clx.pcdt.plcPcd idx = self.__findText(plcPcd, start) - return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, double = False) + return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, full = False) # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: commit 09f36ce62d691c288795cb35026a3c92cd8df429 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:28:44 2013 +0100 PlcPcd: separate parse and dump diff --git a/src/docrecord.py b/src/docrecord.py index 25650d6..418d255 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -168,24 +168,28 @@ class PlcPcd(DOCDirStream, PLC): self.size = size self.aCp = [] self.aPcd = [] + self.ranges = [] - def dump(self): - print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size) pos = self.pos for i in range(self.getElements()): # aCp start = self.getuInt32(pos = pos) end = self.getuInt32(pos = pos + 4) - print '<aCP index="%d" start="%d" end="%d">' % (i, start, end) + self.ranges.append((start, end)) self.aCp.append(start) pos += 4 # aPcd aPcd = Pcd(self.bytes, self.mainStream, self.getOffset(self.pos, i), 8) - aPcd.dump() self.aPcd.append(aPcd) - print '<transformed value="%s"/>' % aPcd.fc.getTransformedValue(start, end) + def dump(self): + print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size) + for i in range(self.getElements()): + start, end = self.ranges[i] + print '<aCP index="%d" start="%d" end="%d">' % (i, start, end) + self.aPcd[i].dump() + print '<transformed value="%s"/>' % self.aPcd[i].fc.getTransformedValue(start, end) print '</aCP>' print '</plcPcd>' @@ -703,11 +707,15 @@ class Pcdt(DOCDirStream): self.pos = offset self.size = size + self.clxt = self.readuInt8() + self.lcb = self.readuInt32() + self.plcPcd = PlcPcd(self.bytes, self.mainStream, self.pos, self.lcb) + def dump(self): print '<pcdt type="Pcdt" offset="%d" size="%d bytes">' % (self.pos, self.size) - self.printAndSet("clxt", self.readuInt8()) - self.printAndSet("lcb", self.readuInt32()) - PlcPcd(self.bytes, self.mainStream, self.pos, self.lcb).dump() + self.printAndSet("clxt", self.clxt) + self.printAndSet("lcb", self.lcb) + self.plcPcd.dump() print '</pcdt>' class Clx(DOCDirStream): commit 0a6ef67326a23e6c5c13f945130e118d8ffaf3f2 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:20:59 2013 +0100 PlcPcd: store aCp and aPcd array diff --git a/src/docrecord.py b/src/docrecord.py index ef32730..25650d6 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -166,6 +166,8 @@ class PlcPcd(DOCDirStream, PLC): PLC.__init__(self, size, 8) # 8 is defined by 2.8.35 self.pos = offset self.size = size + self.aCp = [] + self.aPcd = [] def dump(self): print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size) @@ -175,11 +177,13 @@ class PlcPcd(DOCDirStream, PLC): start = self.getuInt32(pos = pos) end = self.getuInt32(pos = pos + 4) print '<aCP index="%d" start="%d" end="%d">' % (i, start, end) + self.aCp.append(start) pos += 4 # aPcd aPcd = Pcd(self.bytes, self.mainStream, self.getOffset(self.pos, i), 8) aPcd.dump() + self.aPcd.append(aPcd) print '<transformed value="%s"/>' % aPcd.fc.getTransformedValue(start, end) print '</aCP>' commit d35016594f20bf86f4b39b2dcdbcc6820a397ef7 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:16:42 2013 +0100 WordDocumentStream: add a retrieveText method diff --git a/src/docrecord.py b/src/docrecord.py index 770b956..ef32730 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -30,13 +30,16 @@ class FcCompressed(DOCDirStream): self.printAndSet("r1", self.r1) print '</fcCompressed>' - def getTransformedValue(self, start, end): - if self.fCompressed: - offset = self.fc/2 - return globals.encodeName(self.mainStream.bytes[offset:offset+end-start]) - else: - offset = self.fc - return globals.encodeName(self.mainStream.bytes[offset:offset+end*2-start].decode('utf-16'), lowOnly = True) + def getTransformedValue(self, start, end, double = True): + if self.fCompressed: + offset = self.fc/2 + return globals.encodeName(self.mainStream.bytes[offset:offset+end-start]) + else: + l = end - start + if double: + l = l * 2 + offset = self.fc + return globals.encodeName(self.mainStream.bytes[offset:offset+l].decode('utf-16'), lowOnly = True) @staticmethod def getFCTransformedValue(bytes, start, end): diff --git a/src/docstream.py b/src/docstream.py index f33fcf4..f56e27e 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -614,4 +614,16 @@ class WordDocumentStream(DOCDirStream): self.__dumpFibRgFcLcb2002() print '</%s>' % name + def __findText(self, plcPcd, cp): + """Find the largest i such that plcPcd.aCp[i] <= cp.""" + for i in range(len(plcPcd.aCp)): + if plcPcd.aCp[i] <= cp: + index = i + return index + + def retrieveText(self, start, end): + plcPcd = self.clx.pcdt.plcPcd + idx = self.__findText(plcPcd, start) + return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, double = False) + # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab: commit e86350962d0e874edf292e6b86fa35cf06aacd97 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:10:37 2013 +0100 parse Clx early diff --git a/src/docstream.py b/src/docstream.py index 7671bf8..f33fcf4 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -390,6 +390,18 @@ class WordDocumentStream(DOCDirStream): ["fcSttbfUssr"], ["lcbSttbfUssr"], ] + + # Parse Clx early, as it's needed by other structures. + posOrig = self.pos + for i in fields: + value = self.readInt32() + if i[0] == "fcClx": + self.printAndSet(i[0], value, silent = True) + if i[0] == "lcbClx": + self.printAndSet(i[0], value, silent = True) + i[1](silent = True) + self.pos = posOrig + for i in fields: value = self.readInt32() hasHandler = len(i) > 1 @@ -409,11 +421,12 @@ class WordDocumentStream(DOCDirStream): def handleDop(self): docrecord.Dop(self).dump() - def handleLcbClx(self): + def handleLcbClx(self, silent = False): offset = self.fcClx size = self.lcbClx clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) - clx.dump() + if not silent: + clx.dump() def handleLcbPlcfBteChpx(self): plcBteChpx = docrecord.PlcBteChpx(self) commit f8902aa146f25d1f4aec76d59d207ab68740cf6b Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:07:20 2013 +0100 Clx: decouple parsing from dumping diff --git a/src/docrecord.py b/src/docrecord.py index 85a967f..770b956 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -709,12 +709,15 @@ class Clx(DOCDirStream): self.pos = offset self.size = size + self.firstByte = self.getuInt8() + if self.firstByte == 0x02: + self.pcdt = Pcdt(self.bytes, self.mainStream, self.pos, self.size) + def dump(self): print '<clx type="Clx" offset="%d" size="%d bytes">' % (self.pos, self.size) - firstByte = self.getuInt8() - if firstByte == 0x02: + if self.firstByte == 0x02: print '<info what="Array of Prc, 0 elements"/>' - Pcdt(self.bytes, self.mainStream, self.pos, self.size).dump() + self.pcdt.dump() else: print '<todo what="Clx::dump() first byte is not 0x02"/>' print '</clx>' commit 3a42fcbf8610c2575252e33adfe4132863192970 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 22:05:18 2013 +0100 Pcd: decouple parsing from dumping diff --git a/src/docrecord.py b/src/docrecord.py index 4efc546..85a967f 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -49,16 +49,21 @@ class Pcd(DOCDirStream): self.pos = offset self.size = size - def dump(self): - print '<pcd type="Pcd" offset="%d" size="%d bytes">' % (self.pos, self.size) buf = self.readuInt16() - self.printAndSet("fNoParaLast", self.getBit(buf, 0)) - self.printAndSet("fR1", self.getBit(buf, 1)) - self.printAndSet("fDirty", self.getBit(buf, 2)) - self.printAndSet("fR2", buf & (2**13-1)) + self.fNoParaLast = self.getBit(buf, 0) + self.fR1 = self.getBit(buf, 1) + self.fDirty = self.getBit(buf, 2) + self.fR2 = buf & (2**13-1) self.fc = FcCompressed(self.bytes, self.mainStream, self.pos, 4) - self.fc.dump() self.pos += 4 + + def dump(self): + print '<pcd type="Pcd" offset="%d" size="%d bytes">' % (self.pos, self.size) + self.printAndSet("fNoParaLast", self.fNoParaLast) + self.printAndSet("fR1", self.fR1) + self.printAndSet("fDirty", self.fDirty) + self.printAndSet("fR2", self.fR2) + self.fc.dump() print '</pcd>' class PLC: commit 9e5fe43c776fadee01da3f1c7174c16dd71cb0fb Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 21:56:53 2013 +0100 FcCompressed: decouple parsing from dumping diff --git a/src/docrecord.py b/src/docrecord.py index 82005b2..4efc546 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -18,12 +18,16 @@ class FcCompressed(DOCDirStream): self.pos = offset self.size = size + buf = self.readuInt32() + self.fc = buf & ((2**32-1) >> 2) # bits 0..29 + self.fCompressed = self.getBit(buf, 30) + self.r1 = self.getBit(buf, 31) + def dump(self): print '<fcCompressed type="FcCompressed" offset="%d" size="%d bytes">' % (self.pos, self.size) - buf = self.readuInt32() - self.printAndSet("fc", buf & ((2**32-1) >> 2)) # bits 0..29 - self.printAndSet("fCompressed", self.getBit(buf, 30)) - self.printAndSet("r1", self.getBit(buf, 31)) + self.printAndSet("fc", self.fc) + self.printAndSet("fCompressed", self.fCompressed) + self.printAndSet("r1", self.r1) print '</fcCompressed>' def getTransformedValue(self, start, end): commit aab6a67b4032cee6401fa206c84b8bb98bdc8f98 Author: Miklos Vajna <[email protected]> Date: Sat Jan 5 20:56:27 2013 +0100 pass reference to parent in handleLcbPlcfBteChpx, PlcBteChpx and PnFkpChpx diff --git a/src/docrecord.py b/src/docrecord.py index fc041e5..82005b2 100644 --- a/src/docrecord.py +++ b/src/docrecord.py @@ -468,10 +468,11 @@ class BxPap(DOCDirStream): class ChpxFkp(DOCDirStream): """The ChpxFkp structure maps text to its character properties.""" - def __init__(self, bytes, mainStream, offset, size): - DOCDirStream.__init__(self, mainStream.bytes) + def __init__(self, pnFkpChpx, offset, size): + DOCDirStream.__init__(self, pnFkpChpx.mainStream.bytes) self.pos = offset self.size = size + self.pnFkpChpx = pnFkpChpx def dump(self): print '<chpxFkp type="ChpxFkp" offset="%d" size="%d bytes">' % (self.pos, self.size) @@ -525,17 +526,18 @@ class PapxFkp(DOCDirStream): class PnFkpChpx(DOCDirStream): """The PnFkpChpx structure specifies the location in the WordDocument Stream of a ChpxFkp structure.""" - def __init__(self, bytes, mainStream, offset, size, name): - DOCDirStream.__init__(self, bytes, mainStream=mainStream) + def __init__(self, plcBteChpx, offset, size, name): + DOCDirStream.__init__(self, plcBteChpx.bytes, mainStream=plcBteChpx.mainStream) self.pos = offset self.size = size self.name = name + self.plcBteChpx = plcBteChpx def dump(self): print '<%s type="PnFkpChpx" offset="%d" size="%d bytes">' % (self.name, self.pos, self.size) buf = self.readuInt32() self.printAndSet("pn", buf & (2**22-1)) - chpxFkp = ChpxFkp(self.bytes, self.mainStream, self.pn*512, 512) + chpxFkp = ChpxFkp(self, self.pn*512, 512) chpxFkp.dump() print '</%s>' % self.name @@ -587,11 +589,11 @@ class PnFkpPapx(DOCDirStream): class PlcBteChpx(DOCDirStream, PLC): """The PlcBteChpx structure is a PLC that maps the offsets of text in the WordDocument stream to the character properties of that text.""" - def __init__(self, bytes, mainStream, offset, size): - DOCDirStream.__init__(self, bytes, mainStream=mainStream) - PLC.__init__(self, size, 4) - self.pos = offset - self.size = size + def __init__(self, mainStream): + DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream) + PLC.__init__(self, mainStream.lcbPlcfBteChpx, 4) + self.pos = mainStream.fcPlcfBteChpx + self.size = mainStream.lcbPlcfBteChpx def dump(self): print '<plcBteChpx type="PlcBteChpx" offset="%d" size="%d bytes">' % (self.pos, self.size) @@ -604,7 +606,7 @@ class PlcBteChpx(DOCDirStream, PLC): pos += 4 # aPnBteChpx - aPnBteChpx = PnFkpChpx(self.bytes, self.mainStream, self.getOffset(self.pos, i), 4, "aPnBteChpx") + aPnBteChpx = PnFkpChpx(self, self.getOffset(self.pos, i), 4, "aPnBteChpx") aPnBteChpx.dump() print '</aFC>' print '</plcBteChpx>' diff --git a/src/docstream.py b/src/docstream.py index 5d4f999..7671bf8 100644 --- a/src/docstream.py +++ b/src/docstream.py @@ -416,9 +416,7 @@ class WordDocumentStream(DOCDirStream): clx.dump() def handleLcbPlcfBteChpx(self): - offset = self.fcPlcfBteChpx - size = self.lcbPlcfBteChpx - plcBteChpx = docrecord.PlcBteChpx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size) + plcBteChpx = docrecord.PlcBteChpx(self) plcBteChpx.dump() def handleLcbPlcfBtePapx(self): _______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
