Kelson has submitted this change and it was merged.

Change subject: Read the cluster content only when necessary.
......................................................................


Read the cluster content only when necessary.

Instead of reading the full cluster content at cluster creation, it is
better to read the cluster content when we need it.
This is mainly useful when we want a cluster to only get an article
offset.
For compressed cluster we read all in once cause :
- We create a proxy uncompressor stream from the input stream.
  This proxy stream do not handle teelg who is necessary for lazy_read.
- This change is only useful when we use getOffset, and there is no
  offset available on comressed cluster.

We do not use the operator>> anymore.
This operator is designed to allow chained reads. ie :
in >> cluster1 >> cluster2;

As may do not read all the content when reading from in, the use of the
operator>> is now not desirable.

Change-Id: I0709eb6b8fe49512ee302d13dfd5641cdc1c676b
---
M zimlib/include/zim/cluster.h
M zimlib/src/cluster.cpp
M zimlib/src/file.cpp
M zimlib/src/fileimpl.cpp
4 files changed, 87 insertions(+), 46 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimlib/include/zim/cluster.h b/zimlib/include/zim/cluster.h
index 96b16f0..4fd3971 100644
--- a/zimlib/include/zim/cluster.h
+++ b/zimlib/include/zim/cluster.h
@@ -23,6 +23,7 @@
 #include <zim/zim.h>
 #include <zim/refcounted.h>
 #include <zim/smartptr.h>
+#include <zim/fstream.h>
 #include <iosfwd>
 #include <vector>
 
@@ -33,7 +34,6 @@
 
   class ClusterImpl : public RefCounted
   {
-      friend std::istream& operator>> (std::istream& in, ClusterImpl& 
blobImpl);
       friend std::ostream& operator<< (std::ostream& out, const ClusterImpl& 
blobImpl);
 
       typedef std::vector<size_type> Offsets;
@@ -41,11 +41,28 @@
 
       CompressionType compression;
       Offsets offsets;
-      Data data;
+      Data _data;
       offset_type startOffset;
 
-      void read(std::istream& in);
+      ifstream* lazy_read_stream;
+
+      offset_type read_header(std::istream& in);
+      void read_content(std::istream& in);
       void write(std::ostream& out) const;
+
+      void set_lazy_read(ifstream* in) {
+        lazy_read_stream = in;
+      }
+
+      bool is_fully_initialised() const { return lazy_read_stream == 0; }
+      void finalise_read();
+      const Data& data() const {
+        if ( !is_fully_initialised() )
+        {
+           const_cast<ClusterImpl*>(this)->finalise_read();
+        }
+        return _data;
+      }
 
     public:
       ClusterImpl();
@@ -55,20 +72,21 @@
       bool isCompressed() const                { return compression == 
zimcompZip || compression == zimcompBzip2 || compression == zimcompLzma; }
 
       size_type getCount() const               { return offsets.size() - 1; }
-      const char* getData(unsigned n) const    { return &data[ offsets[n] ]; }
+      const char* getData(unsigned n) const    { return &data()[ offsets[n] ]; 
}
       size_type getSize(unsigned n) const      { return offsets[n+1] - 
offsets[n]; }
-      size_type getSize() const                { return offsets.size() * 
sizeof(size_type) + data.size(); }
+      size_type getSize() const                { return offsets.size() * 
sizeof(size_type) + data().size(); }
       offset_type getOffset(size_type n) const { return startOffset + 
offsets[n]; }
       Blob getBlob(size_type n) const;
       void clear();
 
       void addBlob(const Blob& blob);
       void addBlob(const char* data, unsigned size);
+
+      void init_from_stream(ifstream& in, offset_type offset);
   };
 
   class Cluster
   {
-      friend std::istream& operator>> (std::istream& in, Cluster& blob);
       friend std::ostream& operator<< (std::ostream& out, const Cluster& blob);
 
       SmartPtr<ClusterImpl> impl;
@@ -98,10 +116,10 @@
       void addBlob(const Blob& blob)                { 
getImpl()->addBlob(blob); }
 
       operator bool() const   { return impl; }
+
+      void init_from_stream(ifstream& in, offset_type offset);
   };
 
-  std::istream& operator>> (std::istream& in, ClusterImpl& blobImpl);
-  std::istream& operator>> (std::istream& in, Cluster& blob);
   std::ostream& operator<< (std::ostream& out, const ClusterImpl& blobImpl);
   std::ostream& operator<< (std::ostream& out, const Cluster& blob);
 
diff --git a/zimlib/src/cluster.cpp b/zimlib/src/cluster.cpp
index 3b24fee..9dbefdc 100644
--- a/zimlib/src/cluster.cpp
+++ b/zimlib/src/cluster.cpp
@@ -20,6 +20,7 @@
 #include <zim/cluster.h>
 #include <zim/blob.h>
 #include <zim/endian.h>
+#include <zim/error.h>
 #include <stdlib.h>
 #include <sstream>
 
@@ -60,34 +61,35 @@
   }
 
   ClusterImpl::ClusterImpl()
-    : compression(zimcompNone)
+    : compression(zimcompNone),
+      startOffset(0),
+      lazy_read_stream(NULL)
   {
     offsets.push_back(0);
   }
 
-  void ClusterImpl::read(std::istream& in)
+  /* This return the number of char read */
+  offset_type ClusterImpl::read_header(std::istream& in)
   {
-    log_debug1("read");
-
+    log_debug1("read_header");
     // read first offset, which specifies, how many offsets we need to read
     size_type offset;
     in.read(reinterpret_cast<char*>(&offset), sizeof(offset));
     if (in.fail())
-      return;
+    {
+      std::cerr << "fail at read offset" << std::endl;
+      throw ZimFileFormatError("fail at read first offset");
+    }
 
     offset = fromLittleEndian(&offset);
 
     size_type n = offset / 4;
     size_type a = offset;
-    // offset are from start of cluster !after the char telling the 
compression!
-    // but startOffset is offset from start of the cluster.
-    startOffset = offset + sizeof(char);
 
     log_debug1("first offset is " << offset << " n=" << n << " a=" << a);
 
     // read offsets
     offsets.clear();
-    data.clear();
     offsets.reserve(n);
     offsets.push_back(0);
     while (--n)
@@ -95,27 +97,44 @@
       in.read(reinterpret_cast<char*>(&offset), sizeof(offset));
       if (in.fail())
       {
-        log_debug1("fail at " << n);
-        return;
+        log_debug("fail at " << n);
+        throw ZimFileFormatError("fail at read offset");
       }
       offset = fromLittleEndian(&offset);
       log_debug1("offset=" << offset << '(' << offset-a << ')');
       offsets.push_back(offset - a);
     }
+    return a;
+  }
 
+  void ClusterImpl::read_content(std::istream& in)
+  {
+    log_debug1("read_content");
+    _data.clear();
     // last offset points past the end of the cluster, so we know now, how may 
bytes to read
     if (offsets.size() > 1)
     {
-      n = offsets.back() - offsets.front();
+      size_type n = offsets.back() - offsets.front();
       if (n > 0)
       {
-        data.resize(n);
-        log_debug1("read " << n << " bytes of data");
-        in.read(&(data[0]), n);
+        _data.resize(n);
+        log_debug("read " << n << " bytes of data");
+        in.read(&(_data[0]), n);
       }
       else
         log_warn("read empty cluster");
     }
+  }
+
+  void ClusterImpl::finalise_read() {
+    if ( !lazy_read_stream )
+    {
+        std::cerr << "lazy_read null" << std::endl;
+        return;
+    }
+    lazy_read_stream->seekg(startOffset);
+    read_content(*lazy_read_stream);
+    lazy_read_stream = NULL;
   }
 
   void ClusterImpl::write(std::ostream& out) const
@@ -129,8 +148,8 @@
       out.write(reinterpret_cast<const char*>(&o), sizeof(size_type));
     }
 
-    if (data.size() > 0)
-      out.write(&(data[0]), data.size());
+    if (_data.size() > 0)
+      out.write(&(_data[0]), _data.size());
     else
       log_warn("write empty cluster");
   }
@@ -138,8 +157,8 @@
   void ClusterImpl::addBlob(const Blob& blob)
   {
     log_debug1("addBlob(ptr, " << blob.size() << ')');
-    data.insert(data.end(), blob.data(), blob.end());
-    offsets.push_back(data.size());
+    _data.insert(_data.end(), blob.data(), blob.end());
+    offsets.push_back(_data.size());
   }
 
   Blob ClusterImpl::getBlob(size_type n) const
@@ -152,7 +171,7 @@
   void ClusterImpl::clear()
   {
     offsets.clear();
-    data.clear();
+    _data.clear();
     offsets.push_back(0);
   }
 
@@ -166,19 +185,29 @@
     return impl->getBlob(n);
   }
 
-  std::istream& operator>> (std::istream& in, ClusterImpl& clusterImpl)
+  void Cluster::init_from_stream(ifstream& in, offset_type offset)
   {
-    log_trace("read cluster");
+    getImpl()->init_from_stream(in, offset);
+  }
+
+  void ClusterImpl::init_from_stream(ifstream& in, offset_type offset)
+  {
+    log_trace("init_from_stream");
+    in.seekg(offset);
+
+    clear();
 
     char c;
     in.get(c);
-    clusterImpl.setCompression(static_cast<CompressionType>(c));
+    setCompression(static_cast<CompressionType>(c));
 
     switch (static_cast<CompressionType>(c))
     {
       case zimcompDefault:
       case zimcompNone:
-        clusterImpl.read(in);
+        startOffset = read_header(in);
+        startOffset += sizeof(char) + offset;
+        set_lazy_read(&in);
         break;
 
       case zimcompZip:
@@ -187,7 +216,8 @@
           log_debug("uncompress data (zlib)");
           zim::InflateStream is(in);
           is.exceptions(std::ios::failbit | std::ios::badbit);
-          clusterImpl.read(is);
+          read_header(is);
+          read_content(is);
 #else
           throw std::runtime_error("zlib not enabled in this library");
 #endif
@@ -200,7 +230,8 @@
           log_debug("uncompress data (bzip2)");
           zim::Bunzip2Stream is(in);
           is.exceptions(std::ios::failbit | std::ios::badbit);
-          clusterImpl.read(is);
+          read_header(is);
+          read_content(is);
 #else
           throw std::runtime_error("bzip2 not enabled in this library");
 #endif
@@ -213,7 +244,8 @@
           log_debug("uncompress data (lzma)");
           zim::UnlzmaStream is(in);
           is.exceptions(std::ios::failbit | std::ios::badbit);
-          clusterImpl.read(is);
+          read_header(is);
+          read_content(is);
 #else
           throw std::runtime_error("lzma not enabled in this library");
 #endif
@@ -225,13 +257,6 @@
         in.setstate(std::ios::failbit);
         break;
     }
-
-    return in;
-  }
-
-  std::istream& operator>> (std::istream& in, Cluster& cluster)
-  {
-    return in >> *cluster.getImpl();
   }
 
   std::ostream& operator<< (std::ostream& out, const ClusterImpl& clusterImpl)
diff --git a/zimlib/src/file.cpp b/zimlib/src/file.cpp
index c5f25a4..38bcf76 100644
--- a/zimlib/src/file.cpp
+++ b/zimlib/src/file.cpp
@@ -206,8 +206,7 @@
     Cluster cluster = getCluster(clusterIdx);
     if (cluster.isCompressed())
         return 0;
-    offset_type blobOffset = cluster.getBlobOffset(blobIdx);
-    return getClusterOffset(clusterIdx) + blobOffset;
+    return cluster.getBlobOffset(blobIdx);
   }
 
   std::string urldecode(const std::string& url)
diff --git a/zimlib/src/fileimpl.cpp b/zimlib/src/fileimpl.cpp
index 8c072eb..cc305d7 100644
--- a/zimlib/src/fileimpl.cpp
+++ b/zimlib/src/fileimpl.cpp
@@ -176,8 +176,7 @@
 
     offset_type clusterOffset = getClusterOffset(idx);
     log_debug("read cluster " << idx << " from offset " << clusterOffset);
-    zimFile.seekg(clusterOffset);
-    zimFile >> cluster;
+    cluster.init_from_stream(zimFile, clusterOffset);
 
     if (zimFile.fail())
       throw ZimFileFormatError("error reading cluster data");

-- 
To view, visit https://gerrit.wikimedia.org/r/314718
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I0709eb6b8fe49512ee302d13dfd5641cdc1c676b
Gerrit-PatchSet: 1
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: Mgautierfr <[email protected]>
Gerrit-Reviewer: Kelson <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to