docstream.py

Miklos Vajna Sat, 05 Jan 2013 13:46:06 -0800

 src/docrecord.py |  116 ++++++++++++++++++++++++++++++++++++-------------------
 src/docstream.py |   35 +++++++++++++---
 2 files changed, 106 insertions(+), 45 deletions(-)


New commits:
commit 123b9721a4b19f469051696b969542b961261392
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:45:47 2013 +0100

    ChpxFkp: properly transform non-compressed strings

diff --git a/src/docrecord.py b/src/docrecord.py
index 418d255..52dcfd4 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -30,16 +30,25 @@ class FcCompressed(DOCDirStream):
         self.printAndSet("r1", self.r1)
         print '</fcCompressed>'
 
-    def getTransformedValue(self, start, end, double = True):
+    def getTransformedValue(self, start, end, full = True):
         if self.fCompressed:
             offset = self.fc/2
-            return globals.encodeName(self.mainStream.bytes[offset:offset+end-start])
+            if full:
+                fro = offset
+                to = offset+end-start
+            else:
+                fro = start
+                to = end
+            return globals.encodeName(self.mainStream.bytes[fro:to])
         else:
-            l = end - start
-            if double:
-                l = l * 2
-            offset = self.fc
-            return globals.encodeName(self.mainStream.bytes[offset:offset+l].decode('utf-16'), lowOnly = True)
+            if full:
+                offset = self.fc
+                fro = offset
+                to = offset + (end - start) * 2
+            else:
+                fro = start
+                to = end
+            return globals.encodeName(self.mainStream.bytes[fro:to].decode('utf-16'), lowOnly = True)
 
     @staticmethod
     def getFCTransformedValue(bytes, start, end):
@@ -503,7 +512,7 @@ class ChpxFkp(DOCDirStream):
             start = self.getuInt32(pos = pos)
             end = self.getuInt32(pos = pos + 4)
             print '<rgfc index="%d" start="%d" end="%d">' % (i, start, end)
-            print '<transformed value="%s"/>' % FcCompressed.getFCTransformedValue(self.bytes, start, end)
+            print '<transformed value="%s"/>' % self.pnFkpChpx.mainStream.retrieveText(start, end)
             pos += 4
 
             # rgbx
diff --git a/src/docstream.py b/src/docstream.py
index f56e27e..5197c40 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -424,9 +424,9 @@ class WordDocumentStream(DOCDirStream):
     def handleLcbClx(self, silent = False):
         offset = self.fcClx
         size = self.lcbClx
-        clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+        self.clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
         if not silent:
-            clx.dump()
+            self.clx.dump()
 
     def handleLcbPlcfBteChpx(self):
         plcBteChpx = docrecord.PlcBteChpx(self)
@@ -624,6 +624,6 @@ class WordDocumentStream(DOCDirStream):
     def retrieveText(self, start, end):
         plcPcd = self.clx.pcdt.plcPcd
         idx = self.__findText(plcPcd, start)
-        return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, double = False)
+        return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, full = False)
 
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit 09f36ce62d691c288795cb35026a3c92cd8df429
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:28:44 2013 +0100

    PlcPcd: separate parse and dump

diff --git a/src/docrecord.py b/src/docrecord.py
index 25650d6..418d255 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -168,24 +168,28 @@ class PlcPcd(DOCDirStream, PLC):
         self.size = size
         self.aCp = []
         self.aPcd = []
+        self.ranges = []
 
-    def dump(self):
-        print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
         pos = self.pos
         for i in range(self.getElements()):
             # aCp
             start = self.getuInt32(pos = pos)
             end = self.getuInt32(pos = pos + 4)
-            print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
+            self.ranges.append((start, end))
             self.aCp.append(start)
             pos += 4
 
             # aPcd
             aPcd = Pcd(self.bytes, self.mainStream, self.getOffset(self.pos, i), 8)
-            aPcd.dump()
             self.aPcd.append(aPcd)
 
-            print '<transformed value="%s"/>' % aPcd.fc.getTransformedValue(start, end)
+    def dump(self):
+        print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
+        for i in range(self.getElements()):
+            start, end = self.ranges[i]
+            print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
+            self.aPcd[i].dump()
+            print '<transformed value="%s"/>' % self.aPcd[i].fc.getTransformedValue(start, end)
             print '</aCP>'
         print '</plcPcd>'
 
@@ -703,11 +707,15 @@ class Pcdt(DOCDirStream):
         self.pos = offset
         self.size = size
 
+        self.clxt = self.readuInt8()
+        self.lcb = self.readuInt32()
+        self.plcPcd = PlcPcd(self.bytes, self.mainStream, self.pos, self.lcb)
+
     def dump(self):
         print '<pcdt type="Pcdt" offset="%d" size="%d bytes">' % (self.pos, self.size)
-        self.printAndSet("clxt", self.readuInt8())
-        self.printAndSet("lcb", self.readuInt32())
-        PlcPcd(self.bytes, self.mainStream, self.pos, self.lcb).dump()
+        self.printAndSet("clxt", self.clxt)
+        self.printAndSet("lcb", self.lcb)
+        self.plcPcd.dump()
         print '</pcdt>'
 
 class Clx(DOCDirStream):
commit 0a6ef67326a23e6c5c13f945130e118d8ffaf3f2
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:20:59 2013 +0100

    PlcPcd: store aCp and aPcd array

diff --git a/src/docrecord.py b/src/docrecord.py
index ef32730..25650d6 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -166,6 +166,8 @@ class PlcPcd(DOCDirStream, PLC):
         PLC.__init__(self, size, 8) # 8 is defined by 2.8.35
         self.pos = offset
         self.size = size
+        self.aCp = []
+        self.aPcd = []
 
     def dump(self):
         print '<plcPcd type="PlcPcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
@@ -175,11 +177,13 @@ class PlcPcd(DOCDirStream, PLC):
             start = self.getuInt32(pos = pos)
             end = self.getuInt32(pos = pos + 4)
             print '<aCP index="%d" start="%d" end="%d">' % (i, start, end)
+            self.aCp.append(start)
             pos += 4
 
             # aPcd
             aPcd = Pcd(self.bytes, self.mainStream, self.getOffset(self.pos, i), 8)
             aPcd.dump()
+            self.aPcd.append(aPcd)
 
             print '<transformed value="%s"/>' % aPcd.fc.getTransformedValue(start, end)
             print '</aCP>'
commit d35016594f20bf86f4b39b2dcdbcc6820a397ef7
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:16:42 2013 +0100

    WordDocumentStream: add a retrieveText method

diff --git a/src/docrecord.py b/src/docrecord.py
index 770b956..ef32730 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -30,13 +30,16 @@ class FcCompressed(DOCDirStream):
         self.printAndSet("r1", self.r1)
         print '</fcCompressed>'
 
-    def getTransformedValue(self, start, end):
-            if self.fCompressed:
-                offset = self.fc/2
-                return globals.encodeName(self.mainStream.bytes[offset:offset+end-start])
-            else:
-                offset = self.fc
-                return globals.encodeName(self.mainStream.bytes[offset:offset+end*2-start].decode('utf-16'), lowOnly = True)
+    def getTransformedValue(self, start, end, double = True):
+        if self.fCompressed:
+            offset = self.fc/2
+            return globals.encodeName(self.mainStream.bytes[offset:offset+end-start])
+        else:
+            l = end - start
+            if double:
+                l = l * 2
+            offset = self.fc
+            return globals.encodeName(self.mainStream.bytes[offset:offset+l].decode('utf-16'), lowOnly = True)
 
     @staticmethod
     def getFCTransformedValue(bytes, start, end):
diff --git a/src/docstream.py b/src/docstream.py
index f33fcf4..f56e27e 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -614,4 +614,16 @@ class WordDocumentStream(DOCDirStream):
         self.__dumpFibRgFcLcb2002()
         print '</%s>' % name
 
+    def __findText(self, plcPcd, cp):
+        """Find the largest i such that plcPcd.aCp[i] <= cp."""
+        for i in range(len(plcPcd.aCp)):
+            if plcPcd.aCp[i] <= cp:
+                index = i
+        return index
+
+    def retrieveText(self, start, end):
+        plcPcd = self.clx.pcdt.plcPcd
+        idx = self.__findText(plcPcd, start)
+        return plcPcd.aPcd[idx].fc.getTransformedValue(start, end, double = False)
+
 # vim:set filetype=python shiftwidth=4 softtabstop=4 expandtab:
commit e86350962d0e874edf292e6b86fa35cf06aacd97
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:10:37 2013 +0100

    parse Clx early

diff --git a/src/docstream.py b/src/docstream.py
index 7671bf8..f33fcf4 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -390,6 +390,18 @@ class WordDocumentStream(DOCDirStream):
             ["fcSttbfUssr"],
             ["lcbSttbfUssr"],
                 ]
+
+        # Parse Clx early, as it's needed by other structures.
+        posOrig = self.pos
+        for i in fields:
+            value = self.readInt32()
+            if i[0] == "fcClx":
+                self.printAndSet(i[0], value, silent = True)
+            if i[0] == "lcbClx":
+                self.printAndSet(i[0], value, silent = True)
+                i[1](silent = True)
+        self.pos = posOrig
+
         for i in fields:
             value = self.readInt32()
             hasHandler = len(i) > 1
@@ -409,11 +421,12 @@ class WordDocumentStream(DOCDirStream):
     def handleDop(self):
         docrecord.Dop(self).dump()
 
-    def handleLcbClx(self):
+    def handleLcbClx(self, silent = False):
         offset = self.fcClx
         size = self.lcbClx
         clx = docrecord.Clx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
-        clx.dump()
+        if not silent:
+            clx.dump()
 
     def handleLcbPlcfBteChpx(self):
         plcBteChpx = docrecord.PlcBteChpx(self)
commit f8902aa146f25d1f4aec76d59d207ab68740cf6b
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:07:20 2013 +0100

    Clx: decouple parsing from dumping

diff --git a/src/docrecord.py b/src/docrecord.py
index 85a967f..770b956 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -709,12 +709,15 @@ class Clx(DOCDirStream):
         self.pos = offset
         self.size = size
 
+        self.firstByte = self.getuInt8()
+        if self.firstByte == 0x02:
+            self.pcdt = Pcdt(self.bytes, self.mainStream, self.pos, self.size)
+
     def dump(self):
         print '<clx type="Clx" offset="%d" size="%d bytes">' % (self.pos, self.size)
-        firstByte = self.getuInt8()
-        if firstByte == 0x02:
+        if self.firstByte == 0x02:
             print '<info what="Array of Prc, 0 elements"/>'
-            Pcdt(self.bytes, self.mainStream, self.pos, self.size).dump()
+            self.pcdt.dump()
         else:
             print '<todo what="Clx::dump() first byte is not 0x02"/>'
         print '</clx>'
commit 3a42fcbf8610c2575252e33adfe4132863192970
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 22:05:18 2013 +0100

    Pcd: decouple parsing from dumping

diff --git a/src/docrecord.py b/src/docrecord.py
index 4efc546..85a967f 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -49,16 +49,21 @@ class Pcd(DOCDirStream):
         self.pos = offset
         self.size = size
 
-    def dump(self):
-        print '<pcd type="Pcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
         buf = self.readuInt16()
-        self.printAndSet("fNoParaLast", self.getBit(buf, 0))
-        self.printAndSet("fR1", self.getBit(buf, 1))
-        self.printAndSet("fDirty", self.getBit(buf, 2))
-        self.printAndSet("fR2", buf & (2**13-1))
+        self.fNoParaLast = self.getBit(buf, 0)
+        self.fR1 = self.getBit(buf, 1)
+        self.fDirty = self.getBit(buf, 2)
+        self.fR2 = buf & (2**13-1)
         self.fc = FcCompressed(self.bytes, self.mainStream, self.pos, 4)
-        self.fc.dump()
         self.pos += 4
+
+    def dump(self):
+        print '<pcd type="Pcd" offset="%d" size="%d bytes">' % (self.pos, self.size)
+        self.printAndSet("fNoParaLast", self.fNoParaLast)
+        self.printAndSet("fR1", self.fR1)
+        self.printAndSet("fDirty", self.fDirty)
+        self.printAndSet("fR2", self.fR2)
+        self.fc.dump()
         print '</pcd>'
 
 class PLC:
commit 9e5fe43c776fadee01da3f1c7174c16dd71cb0fb
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 21:56:53 2013 +0100

    FcCompressed: decouple parsing from dumping

diff --git a/src/docrecord.py b/src/docrecord.py
index 82005b2..4efc546 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -18,12 +18,16 @@ class FcCompressed(DOCDirStream):
         self.pos = offset
         self.size = size
 
+        buf = self.readuInt32()
+        self.fc = buf & ((2**32-1) >> 2) # bits 0..29
+        self.fCompressed = self.getBit(buf, 30)
+        self.r1 = self.getBit(buf, 31)
+
     def dump(self):
         print '<fcCompressed type="FcCompressed" offset="%d" size="%d bytes">' % (self.pos, self.size)
-        buf = self.readuInt32()
-        self.printAndSet("fc", buf & ((2**32-1) >> 2)) # bits 0..29
-        self.printAndSet("fCompressed", self.getBit(buf, 30))
-        self.printAndSet("r1", self.getBit(buf, 31))
+        self.printAndSet("fc", self.fc)
+        self.printAndSet("fCompressed", self.fCompressed)
+        self.printAndSet("r1", self.r1)
         print '</fcCompressed>'
 
     def getTransformedValue(self, start, end):
commit aab6a67b4032cee6401fa206c84b8bb98bdc8f98
Author: Miklos Vajna <[email protected]>
Date:   Sat Jan 5 20:56:27 2013 +0100

    pass reference to parent in handleLcbPlcfBteChpx, PlcBteChpx and PnFkpChpx

diff --git a/src/docrecord.py b/src/docrecord.py
index fc041e5..82005b2 100644
--- a/src/docrecord.py
+++ b/src/docrecord.py
@@ -468,10 +468,11 @@ class BxPap(DOCDirStream):
 
 class ChpxFkp(DOCDirStream):
     """The ChpxFkp structure maps text to its character properties."""
-    def __init__(self, bytes, mainStream, offset, size):
-        DOCDirStream.__init__(self, mainStream.bytes)
+    def __init__(self, pnFkpChpx, offset, size):
+        DOCDirStream.__init__(self, pnFkpChpx.mainStream.bytes)
         self.pos = offset
         self.size = size
+        self.pnFkpChpx = pnFkpChpx
 
     def dump(self):
         print '<chpxFkp type="ChpxFkp" offset="%d" size="%d bytes">' % (self.pos, self.size)
@@ -525,17 +526,18 @@ class PapxFkp(DOCDirStream):
 
 class PnFkpChpx(DOCDirStream):
     """The PnFkpChpx structure specifies the location in the WordDocument 
Stream of a ChpxFkp structure."""
-    def __init__(self, bytes, mainStream, offset, size, name):
-        DOCDirStream.__init__(self, bytes, mainStream=mainStream)
+    def __init__(self, plcBteChpx, offset, size, name):
+        DOCDirStream.__init__(self, plcBteChpx.bytes, mainStream=plcBteChpx.mainStream)
         self.pos = offset
         self.size = size
         self.name = name
+        self.plcBteChpx = plcBteChpx
 
     def dump(self):
         print '<%s type="PnFkpChpx" offset="%d" size="%d bytes">' % (self.name, self.pos, self.size)
         buf = self.readuInt32()
         self.printAndSet("pn", buf & (2**22-1))
-        chpxFkp = ChpxFkp(self.bytes, self.mainStream, self.pn*512, 512)
+        chpxFkp = ChpxFkp(self, self.pn*512, 512)
         chpxFkp.dump()
         print '</%s>' % self.name
 
@@ -587,11 +589,11 @@ class PnFkpPapx(DOCDirStream):
 
 class PlcBteChpx(DOCDirStream, PLC):
     """The PlcBteChpx structure is a PLC that maps the offsets of text in the 
WordDocument stream to the character properties of that text."""
-    def __init__(self, bytes, mainStream, offset, size):
-        DOCDirStream.__init__(self, bytes, mainStream=mainStream)
-        PLC.__init__(self, size, 4)
-        self.pos = offset
-        self.size = size
+    def __init__(self, mainStream):
+        DOCDirStream.__init__(self, mainStream.doc.getDirectoryStreamByName("1Table").bytes, mainStream=mainStream)
+        PLC.__init__(self, mainStream.lcbPlcfBteChpx, 4)
+        self.pos = mainStream.fcPlcfBteChpx
+        self.size = mainStream.lcbPlcfBteChpx
 
     def dump(self):
         print '<plcBteChpx type="PlcBteChpx" offset="%d" size="%d bytes">' % (self.pos, self.size)
@@ -604,7 +606,7 @@ class PlcBteChpx(DOCDirStream, PLC):
             pos += 4
 
             # aPnBteChpx
-            aPnBteChpx = PnFkpChpx(self.bytes, self.mainStream, self.getOffset(self.pos, i), 4, "aPnBteChpx")
+            aPnBteChpx = PnFkpChpx(self, self.getOffset(self.pos, i), 4, "aPnBteChpx")
             aPnBteChpx.dump()
             print '</aFC>'
         print '</plcBteChpx>'
diff --git a/src/docstream.py b/src/docstream.py
index 5d4f999..7671bf8 100644
--- a/src/docstream.py
+++ b/src/docstream.py
@@ -416,9 +416,7 @@ class WordDocumentStream(DOCDirStream):
         clx.dump()
 
     def handleLcbPlcfBteChpx(self):
-        offset = self.fcPlcfBteChpx
-        size = self.lcbPlcfBteChpx
-        plcBteChpx = docrecord.PlcBteChpx(self.doc.getDirectoryStreamByName("1Table").bytes, self, offset, size)
+        plcBteChpx = docrecord.PlcBteChpx(self)
         plcBteChpx.dump()
 
     def handleLcbPlcfBtePapx(self):
_______________________________________________
Libreoffice-commits mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

[Libreoffice-commits] .: 9 commits - src/docrecord.py src/docstream.py

Reply via email to