fdo#47644 big 18 meg .doc is super dooper slow to load, mostly because
of the extra checks I added to sanity test .doc file during parsing.
Turns out that seeking backwards in the those ole2 formats is incredibly
slow, this might affect other formats like xls and ppt that use some of
the shared "msfilter" code.

1st attachment gets us back to where we came from in terms of
performance there.

The second one is maybe a bit more contentious for 3-5, but I include it
for a look-over. Basically there's a "page chain" in the ole2 format and
to seek to a location you walk through the chain until you get to the
correct page. I see that in practice most documents have their chain in
a presorted order (I have no idea if having them unsorted is actually a
sign of a broken document or if its legal) so we can do a far faster
binary_search if we walk the chain once and hold on to the results.

So second attachment keeps the chain if its one of the circumstances
where we have to parse the whole thing anyway, and if not (the usual
one) then if we're seeking an (fairly arbitrary) large number of steps
where its likely that the cost-benefit is in favour of it then generate
the chain once and hold on to it for reuse until there any event which
might invalidate the chain.

Second patch knocks about 50 seconds off load time. First patch knocks
some unknown but > 10 mins off load time. 

C.

Don't even think about measuring on a gcc dbgutil build btw, the concept
checking of the stl lower_bound is a huge cost so the times aren't even
roughly uniformly slower on dbgutil vs product but spike dramatically on
some stl stuff.
>From 1e89ba85d89858ae8c3263a4e6b5b522016fe445 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caol...@redhat.com>
Date: Thu, 10 May 2012 15:41:17 +0100
Subject: [PATCH] Resolves: fdo#47644 compound storage backend is poor at
 knowing stream size

---
 sot/inc/sot/stg.hxx              |    3 +++
 sot/inc/sot/storage.hxx          |    1 +
 sot/source/sdstor/stg.cxx        |    7 +++++++
 sot/source/sdstor/storage.cxx    |    7 +++++++
 sot/source/sdstor/ucbstorage.cxx |    5 +++++
 tools/inc/tools/stream.hxx       |    3 ++-
 tools/source/stream/stream.cxx   |    6 +++---
 7 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/sot/inc/sot/stg.hxx b/sot/inc/sot/stg.hxx
index c8d18da..3071017 100644
--- a/sot/inc/sot/stg.hxx
+++ b/sot/inc/sot/stg.hxx
@@ -90,6 +90,7 @@ public:
     virtual sal_uLong   Tell() = 0;
     virtual void    Flush() = 0;
     virtual sal_Bool    SetSize( sal_uLong nNewSize ) = 0;
+    virtual sal_uLong   GetSize() const = 0;
     virtual sal_Bool    CopyTo( BaseStorageStream * pDestStm ) = 0;
     virtual sal_Bool    Commit() = 0;
     virtual sal_Bool    Revert() = 0;
@@ -171,6 +172,7 @@ public:
     virtual sal_uLong   Tell() { return nPos; }
     virtual void    Flush();
     virtual sal_Bool    SetSize( sal_uLong nNewSize );
+    virtual sal_uLong   GetSize() const;
     virtual sal_Bool    CopyTo( BaseStorageStream * pDestStm );
     virtual sal_Bool    Commit();
     virtual sal_Bool    Revert();
@@ -266,6 +268,7 @@ public:
     virtual sal_uLong               Tell();
     virtual void                Flush();
     virtual sal_Bool                SetSize( sal_uLong nNewSize );
+    virtual sal_uLong               GetSize() const;
     virtual sal_Bool                CopyTo( BaseStorageStream * pDestStm );
     virtual sal_Bool                Commit();
     virtual sal_Bool                Revert();
diff --git a/sot/inc/sot/storage.hxx b/sot/inc/sot/storage.hxx
index 26258bc..4f602e9 100644
--- a/sot/inc/sot/storage.hxx
+++ b/sot/inc/sot/storage.hxx
@@ -96,6 +96,7 @@ public:
     sal_Bool                GetProperty( const String& rName, ::com::sun::star::uno::Any& rValue );
     ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream >
                         GetXInputStream() const;
+    virtual sal_Size remainingSize();
 };
 
 #ifndef SOT_DECL_SOTSTORAGESTREAM_DEFINED
diff --git a/sot/source/sdstor/stg.cxx b/sot/source/sdstor/stg.cxx
index 4c4be65..18e95f6 100644
--- a/sot/source/sdstor/stg.cxx
+++ b/sot/source/sdstor/stg.cxx
@@ -252,6 +252,13 @@ sal_Bool StorageStream::SetSize( sal_uLong nNewSize )
         return sal_False;
 }
 
+sal_uLong StorageStream::GetSize() const
+{
+    if( Validate() )
+        return pEntry->GetSize();
+    return 0;
+}
+
 sal_Bool StorageStream::Commit()
 {
     if( !Validate() )
diff --git a/sot/source/sdstor/storage.cxx b/sot/source/sdstor/storage.cxx
index 897f9fe..4d24f1f 100644
--- a/sot/source/sdstor/storage.cxx
+++ b/sot/source/sdstor/storage.cxx
@@ -298,6 +298,13 @@ sal_uInt32 SotStorageStream::GetSize() const
     return nSize;
 }
 
+sal_Size SotStorageStream::remainingSize()
+{
+    if (pOwnStm)
+        return pOwnStm->GetSize() - Tell();
+    return SvStream::remainingSize();
+}
+
 /*************************************************************************
 |*    SotStorageStream::CopyTo()
 |*
diff --git a/sot/source/sdstor/ucbstorage.cxx b/sot/source/sdstor/ucbstorage.cxx
index b6fb581..020fb9a 100644
--- a/sot/source/sdstor/ucbstorage.cxx
+++ b/sot/source/sdstor/ucbstorage.cxx
@@ -1566,6 +1566,11 @@ sal_Bool UCBStorageStream::GetProperty( const String& rName, ::com::sun::star::u
     return sal_False;
 }
 
+sal_uLong UCBStorageStream::GetSize() const
+{
+    return pImp->GetSize();
+}
+
 UCBStorage::UCBStorage( SvStream& rStrm, sal_Bool bDirect )
 {
     String aURL = GetLinkedFile( rStrm );
diff --git a/tools/inc/tools/stream.hxx b/tools/inc/tools/stream.hxx
index 0ad9585..e577ca8 100644
--- a/tools/inc/tools/stream.hxx
+++ b/tools/inc/tools/stream.hxx
@@ -379,7 +379,7 @@ public:
     sal_Size        SeekRel( sal_sSize nPos );
     sal_Size        Tell() const { return nBufFilePos+nBufActualPos;  }
     //length between current (Tell()) pos and end of stream
-    sal_Size        remainingSize();
+    virtual sal_Size remainingSize();
     void            Flush();
     sal_Bool        IsEof() const { return bIsEof; }
     // next Tell() <= nSize
@@ -678,6 +678,7 @@ public:
     sal_Bool            IsObjectMemoryOwner() { return bOwnsData; }
     void            SetResizeOffset( sal_Size nNewResize ) { nResize = nNewResize; }
     sal_Size            GetResizeOffset() const { return nResize; }
+    virtual sal_Size remainingSize() { return GetSize() - Tell(); }
 };
 
 // --------------------
diff --git a/tools/source/stream/stream.cxx b/tools/source/stream/stream.cxx
index 3e870b4..84d652c 100644
--- a/tools/source/stream/stream.cxx
+++ b/tools/source/stream/stream.cxx
@@ -1820,9 +1820,9 @@ sal_Size SvStream::Seek( sal_Size nFilePos )
     return nBufFilePos + nBufActualPos;
 }
 
-//probably not as inefficient as it looks seeing as STREAM_SEEK_TO_END in the
-//Seek backends is nomally special cased feel free to make this virtual and add
-//good implementations for SvFileStream etc
+//STREAM_SEEK_TO_END in the some of the Seek backends is special cased to be
+//efficient, in others e.g. SotStorageStream it's really horribly slow, and in
+//those this should be overridden
 sal_Size SvStream::remainingSize()
 {
     sal_Size nCurr = Tell();
-- 
1.7.7.6

>From 2e8885be8be0e23a43bd9d0d87add436a84c2891 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caol...@redhat.com>
Date: Thu, 10 May 2012 15:43:47 +0100
Subject: [PATCH] Related: fdo#47644 for huge files build a page chain cache
 for seeks

For huge ole2 structured storage files build a page chain cache on
demand to speed up long distance seeks

i.e. reduces .doc parse time for fdo#47644 from 1 minute 7 seconds to 18
seconds for me
---
 sot/source/sdstor/stgstrms.cxx |  133 +++++++++++++++++++++++++++++++++++-----
 sot/source/sdstor/stgstrms.hxx |    4 +
 2 files changed, 121 insertions(+), 16 deletions(-)

diff --git a/sot/source/sdstor/stgstrms.cxx b/sot/source/sdstor/stgstrms.cxx
index 5e6c1ea..e982837 100644
--- a/sot/source/sdstor/stgstrms.cxx
+++ b/sot/source/sdstor/stgstrms.cxx
@@ -26,9 +26,10 @@
  *
  ************************************************************************/
 
+#include <algorithm>
 
 #include <string.h>     // memcpy()
-
+#include <sal/log.hxx>
 #include <osl/file.hxx>
 #include <tools/tempfile.hxx>
 #include <tools/debug.hxx>
@@ -334,6 +335,56 @@ void StgStrm::SetEntry( StgDirEntry& r )
     r.SetDirty();
 }
 
+namespace lcl
+{
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+    using std::is_sorted;
+#else
+    template <typename iter> bool is_sorted(iter aStart, iter aEnd)
+    {
+        if (aStart == aEnd)
+            return true;
+
+        for (iter aNext = aStart + 1; aNext != aEnd; aStart = aNext, ++aNext)
+        {
+            if (*aNext < *aStart)
+                return false;
+        }
+
+        return true;
+    }
+#endif
+}
+
+bool StgStrm::buildPageChainCache()
+{
+    if (nSize > 0)
+        m_aPagesCache.reserve(nSize/nPageSize);
+
+    sal_Int32 nBgn = nStart;
+    while (nBgn >= 0)
+    {
+        m_aPagesCache.push_back(nBgn);
+        sal_Int32 nOldBgn = nBgn;
+        nBgn = pFat->GetNextPage(nBgn);
+        if (nBgn == nOldBgn)
+            return false;
+    }
+
+    m_bSortedPageChain = lcl::is_sorted(m_aPagesCache.begin(), m_aPagesCache.end());
+
+    SAL_WARN_IF(!m_bSortedPageChain, "sot", "unsorted page chain, that's suspicious");
+
+    return true;
+}
+
+//See fdo#47644 for a .doc with a vast amount of pages where seeking around the
+//document takes a colossal amount of time
+//
+//There's a cost to building a page cache, so only build one if the number of
+//pages to seek through hits some sufficiently high value where it's worth it.
+#define ARBITRARY_LARGE_AMOUNT_OF_PAGES 256
+
 // Compute page number and offset for the given byte position.
 // If the position is behind the size, set the stream right
 // behind the EOF.
@@ -355,7 +406,7 @@ sal_Bool StgStrm::Pos2Page( sal_Int32 nBytePos )
         return sal_True;
     if( nNew > nOld )
     {
-        // the new position is behind the current, so an incremental
+        // the new position is after the current, so an incremental
         // positioning is OK. Set the page relative position
         nRel = nNew - nOld;
         nBgn = nPage;
@@ -369,13 +420,59 @@ sal_Bool StgStrm::Pos2Page( sal_Int32 nBytePos )
     }
     // now, traverse the FAT chain.
     nRel /= nPageSize;
+
     sal_Int32 nLast = STG_EOF;
-    while( nRel && nBgn >= 0 )
+    if (m_aPagesCache.empty() && nRel < ARBITRARY_LARGE_AMOUNT_OF_PAGES)
+    {
+        while (nRel && nBgn >= 0)
+        {
+            nLast = nBgn;
+            nBgn = pFat->GetNextPage( nBgn );
+            nRel--;
+        }
+    }
+    else if (nBgn >= 0)
     {
-        nLast = nBgn;
-        nBgn = pFat->GetNextPage( nBgn );
-        nRel--;
+        //Seeking large distances is slow, so if we're starting seeking (some
+        //fairly arbitrary) large distances, build a cache and re-use it for
+        //subsequent seeks
+        if (m_aPagesCache.empty())
+        {
+            SAL_INFO("sot", "kicking off large seek helper\n");
+            buildPageChainCache();
+        }
+
+        std::vector<sal_Int32>::iterator aI;
+
+        if (m_bSortedPageChain)
+            aI = std::lower_bound(m_aPagesCache.begin(), m_aPagesCache.end(), nBgn);
+        else
+            aI = std::find(m_aPagesCache.begin(), m_aPagesCache.end(), nBgn);
+
+        if (aI == m_aPagesCache.end())
+        {
+            SAL_WARN("sot", "Unknown page position");
+            nBgn = STG_EOF;
+        }
+        else
+        {
+            size_t nBgnDistance = std::distance(m_aPagesCache.begin(), aI);
+
+            size_t nIndex = nBgnDistance + nRel;
+
+            if (nIndex > m_aPagesCache.size())
+            {
+                nRel = m_aPagesCache.size() - nBgnDistance;
+                nIndex = m_aPagesCache.size() - 1;
+            }
+            else
+                nRel = 0;
+
+            nLast = nIndex ? m_aPagesCache[nIndex - 1] : STG_EOF;
+            nBgn = m_aPagesCache[nIndex];
+        }
     }
+
     // special case: seek to 1st byte of new, unallocated page
     // (in case the file size is a multiple of the page size)
     if( nBytePos == nSize && nBgn == STG_EOF && !nRel && !nOffset )
@@ -404,6 +501,8 @@ StgPage* StgStrm::GetPhysPage( sal_Int32 nBytePos, sal_Bool bForce )
 
 sal_Bool StgStrm::Copy( sal_Int32 nFrom, sal_Int32 nBytes )
 {
+    m_aPagesCache.clear();
+
     sal_Int32 nTo = nStart;
     sal_Int32 nPgs = ( nBytes + nPageSize - 1 ) / nPageSize;
     while( nPgs-- )
@@ -430,6 +529,8 @@ sal_Bool StgStrm::Copy( sal_Int32 nFrom, sal_Int32 nBytes )
 
 sal_Bool StgStrm::SetSize( sal_Int32 nBytes )
 {
+    m_aPagesCache.clear();
+
     // round up to page size
     sal_Int32 nOld = ( ( nSize + nPageSize - 1 ) / nPageSize ) * nPageSize;
     sal_Int32 nNew = ( ( nBytes + nPageSize - 1 ) / nPageSize ) * nPageSize;
@@ -528,6 +629,8 @@ sal_Int32 StgFATStrm::GetPage( short nOff, sal_Bool bMake, sal_uInt16 *pnMasterA
         {
             if( bMake )
             {
+                m_aPagesCache.clear();
+
                 // create a new master page
                 nFAT = nMaxPage++;
                 pMaster = rIo.Copy( nFAT, STG_FREE );
@@ -582,6 +685,8 @@ sal_Int32 StgFATStrm::GetPage( short nOff, sal_Bool bMake, sal_uInt16 *pnMasterA
 
 sal_Bool StgFATStrm::SetPage( short nOff, sal_Int32 nNewPage )
 {
+    m_aPagesCache.clear();
+
     sal_Bool bRes = sal_True;
     if( nOff < rIo.aHdr.GetFAT1Size() )
         rIo.aHdr.SetFATPage( nOff, nNewPage );
@@ -631,6 +736,8 @@ sal_Bool StgFATStrm::SetPage( short nOff, sal_Int32 nNewPage )
 
 sal_Bool StgFATStrm::SetSize( sal_Int32 nBytes )
 {
+    m_aPagesCache.clear();
+
     // Set the number of entries to a multiple of the page size
     short nOld = (short) ( ( nSize + ( nPageSize - 1 ) ) / nPageSize );
     short nNew = (short) (
@@ -747,16 +854,10 @@ void StgDataStrm::Init( sal_Int32 nBgn, sal_Int32 nLen )
     {
         // determine the actual size of the stream by scanning
         // the FAT chain and counting the # of pages allocated
-        nSize = 0;
-        sal_Int32 nOldBgn = -1;
-        while( nBgn >= 0 && nBgn != nOldBgn )
-        {
-            nOldBgn = nBgn;
-            nBgn = pFat->GetNextPage( nBgn );
-            if( nBgn == nOldBgn )
-                rIo.SetError( ERRCODE_IO_WRONGFORMAT );
-            nSize += nPageSize;
-        }
+        bool bOk = buildPageChainCache();
+        if (!bOk)
+            rIo.SetError( ERRCODE_IO_WRONGFORMAT );
+        nSize = m_aPagesCache.size() * nPageSize;
     }
 }
 
diff --git a/sot/source/sdstor/stgstrms.hxx b/sot/source/sdstor/stgstrms.hxx
index a6a0ad1..cfae9e2 100644
--- a/sot/source/sdstor/stgstrms.hxx
+++ b/sot/source/sdstor/stgstrms.hxx
@@ -30,6 +30,7 @@
 #define _STGSTRMS_HXX
 
 #include <tools/stream.hxx>
+#include <vector>
 
 class StgIo;
 class StgStrm;
@@ -77,6 +78,9 @@ protected:
     sal_Int32 nPage;                        // current logical page
     short nOffset;                      // offset into current page
     short nPageSize;                    // logical page size
+    std::vector<sal_Int32> m_aPagesCache;
+    bool m_bSortedPageChain;
+    bool buildPageChainCache();
     sal_Bool  Copy( sal_Int32 nFrom, sal_Int32 nBytes );
     StgStrm( StgIo& );
 public:
-- 
1.7.7.6

_______________________________________________
LibreOffice mailing list
LibreOffice@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/libreoffice

Reply via email to