[MediaWiki-commits] [Gerrit] openzim[master]: Read the cluster content only when necessary.

2017-03-25 Thread Kelson (Code Review)
Kelson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/314718 )

Change subject: Read the cluster content only when necessary.
..


Read the cluster content only when necessary.

Instead of reading the full cluster content at cluster creation, it is
better to read the cluster content when we need it.
This is mainly useful when we want a cluster to only get an article
offset.
For compressed clusters we read everything at once because:
- We create a proxy uncompressor stream from the input stream.
  This proxy stream does not handle tellg, which is necessary for lazy_read.
- This change is only useful when we use getOffset, and there is no
  offset available on a compressed cluster.

We do not use the operator>> anymore.
This operator is designed to allow chained reads, e.g.:
in >> cluster1 >> cluster2;

As we may no longer read all the content when reading from in, the use of
operator>> is no longer desirable.

Change-Id: I0709eb6b8fe49512ee302d13dfd5641cdc1c676b
---
M zimlib/include/zim/cluster.h
M zimlib/src/cluster.cpp
M zimlib/src/file.cpp
M zimlib/src/fileimpl.cpp
4 files changed, 87 insertions(+), 46 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimlib/include/zim/cluster.h b/zimlib/include/zim/cluster.h
index 96b16f0..4fd3971 100644
--- a/zimlib/include/zim/cluster.h
+++ b/zimlib/include/zim/cluster.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -33,7 +34,6 @@
 
   class ClusterImpl : public RefCounted
   {
-  friend std::istream& operator>> (std::istream& in, ClusterImpl& 
blobImpl);
   friend std::ostream& operator<< (std::ostream& out, const ClusterImpl& 
blobImpl);
 
   typedef std::vector Offsets;
@@ -41,11 +41,28 @@
 
   CompressionType compression;
   Offsets offsets;
-  Data data;
+  Data _data;
   offset_type startOffset;
 
-  void read(std::istream& in);
+  ifstream* lazy_read_stream;
+
+  offset_type read_header(std::istream& in);
+  void read_content(std::istream& in);
   void write(std::ostream& out) const;
+
+  void set_lazy_read(ifstream* in) {
+lazy_read_stream = in;
+  }
+
+  bool is_fully_initialised() const { return lazy_read_stream == 0; }
+  void finalise_read();
+  const Data& data() const {
+if ( !is_fully_initialised() )
+{
+   const_cast(this)->finalise_read();
+}
+return _data;
+  }
 
 public:
   ClusterImpl();
@@ -55,20 +72,21 @@
   bool isCompressed() const{ return compression == 
zimcompZip || compression == zimcompBzip2 || compression == zimcompLzma; }
 
   size_type getCount() const   { return offsets.size() - 1; }
-  const char* getData(unsigned n) const{ return [ offsets[n] ]; }
+  const char* getData(unsigned n) const{ return ()[ offsets[n] ]; 
}
   size_type getSize(unsigned n) const  { return offsets[n+1] - 
offsets[n]; }
-  size_type getSize() const{ return offsets.size() * 
sizeof(size_type) + data.size(); }
+  size_type getSize() const{ return offsets.size() * 
sizeof(size_type) + data().size(); }
   offset_type getOffset(size_type n) const { return startOffset + 
offsets[n]; }
   Blob getBlob(size_type n) const;
   void clear();
 
   void addBlob(const Blob& blob);
   void addBlob(const char* data, unsigned size);
+
+  void init_from_stream(ifstream& in, offset_type offset);
   };
 
   class Cluster
   {
-  friend std::istream& operator>> (std::istream& in, Cluster& blob);
   friend std::ostream& operator<< (std::ostream& out, const Cluster& blob);
 
   SmartPtr impl;
@@ -98,10 +116,10 @@
   void addBlob(const Blob& blob){ 
getImpl()->addBlob(blob); }
 
   operator bool() const   { return impl; }
+
+  void init_from_stream(ifstream& in, offset_type offset);
   };
 
-  std::istream& operator>> (std::istream& in, ClusterImpl& blobImpl);
-  std::istream& operator>> (std::istream& in, Cluster& blob);
   std::ostream& operator<< (std::ostream& out, const ClusterImpl& blobImpl);
   std::ostream& operator<< (std::ostream& out, const Cluster& blob);
 
diff --git a/zimlib/src/cluster.cpp b/zimlib/src/cluster.cpp
index 3b24fee..9dbefdc 100644
--- a/zimlib/src/cluster.cpp
+++ b/zimlib/src/cluster.cpp
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -60,34 +61,35 @@
   }
 
   ClusterImpl::ClusterImpl()
-: compression(zimcompNone)
+: compression(zimcompNone),
+  startOffset(0),
+  lazy_read_stream(NULL)
   {
 offsets.push_back(0);
   }
 
-  void ClusterImpl::read(std::istream& in)
+  /* This return the number of char read */
+  offset_type ClusterImpl::read_header(std::istream& in)
   {
-log_debug1("read");
-
+log_debug1("read_header");
 

[MediaWiki-commits] [Gerrit] openzim[master]: Read the cluster content only when necessary.

2016-10-09 Thread Kelson (Code Review)
Kelson has submitted this change and it was merged.

Change subject: Read the cluster content only when necessary.
..


Read the cluster content only when necessary.

Instead of reading the full cluster content at cluster creation, it is
better to read the cluster content when we need it.
This is mainly useful when we want a cluster to only get an article
offset.
For compressed clusters we read everything at once because:
- We create a proxy uncompressor stream from the input stream.
  This proxy stream does not handle tellg, which is necessary for lazy_read.
- This change is only useful when we use getOffset, and there is no
  offset available on a compressed cluster.

We do not use the operator>> anymore.
This operator is designed to allow chained reads, e.g.:
in >> cluster1 >> cluster2;

As we may no longer read all the content when reading from in, the use of
operator>> is no longer desirable.

Change-Id: I0709eb6b8fe49512ee302d13dfd5641cdc1c676b
---
M zimlib/include/zim/cluster.h
M zimlib/src/cluster.cpp
M zimlib/src/file.cpp
M zimlib/src/fileimpl.cpp
4 files changed, 87 insertions(+), 46 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimlib/include/zim/cluster.h b/zimlib/include/zim/cluster.h
index 96b16f0..4fd3971 100644
--- a/zimlib/include/zim/cluster.h
+++ b/zimlib/include/zim/cluster.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -33,7 +34,6 @@
 
   class ClusterImpl : public RefCounted
   {
-  friend std::istream& operator>> (std::istream& in, ClusterImpl& 
blobImpl);
   friend std::ostream& operator<< (std::ostream& out, const ClusterImpl& 
blobImpl);
 
   typedef std::vector Offsets;
@@ -41,11 +41,28 @@
 
   CompressionType compression;
   Offsets offsets;
-  Data data;
+  Data _data;
   offset_type startOffset;
 
-  void read(std::istream& in);
+  ifstream* lazy_read_stream;
+
+  offset_type read_header(std::istream& in);
+  void read_content(std::istream& in);
   void write(std::ostream& out) const;
+
+  void set_lazy_read(ifstream* in) {
+lazy_read_stream = in;
+  }
+
+  bool is_fully_initialised() const { return lazy_read_stream == 0; }
+  void finalise_read();
+  const Data& data() const {
+if ( !is_fully_initialised() )
+{
+   const_cast(this)->finalise_read();
+}
+return _data;
+  }
 
 public:
   ClusterImpl();
@@ -55,20 +72,21 @@
   bool isCompressed() const{ return compression == 
zimcompZip || compression == zimcompBzip2 || compression == zimcompLzma; }
 
   size_type getCount() const   { return offsets.size() - 1; }
-  const char* getData(unsigned n) const{ return [ offsets[n] ]; }
+  const char* getData(unsigned n) const{ return ()[ offsets[n] ]; 
}
   size_type getSize(unsigned n) const  { return offsets[n+1] - 
offsets[n]; }
-  size_type getSize() const{ return offsets.size() * 
sizeof(size_type) + data.size(); }
+  size_type getSize() const{ return offsets.size() * 
sizeof(size_type) + data().size(); }
   offset_type getOffset(size_type n) const { return startOffset + 
offsets[n]; }
   Blob getBlob(size_type n) const;
   void clear();
 
   void addBlob(const Blob& blob);
   void addBlob(const char* data, unsigned size);
+
+  void init_from_stream(ifstream& in, offset_type offset);
   };
 
   class Cluster
   {
-  friend std::istream& operator>> (std::istream& in, Cluster& blob);
   friend std::ostream& operator<< (std::ostream& out, const Cluster& blob);
 
   SmartPtr impl;
@@ -98,10 +116,10 @@
   void addBlob(const Blob& blob){ 
getImpl()->addBlob(blob); }
 
   operator bool() const   { return impl; }
+
+  void init_from_stream(ifstream& in, offset_type offset);
   };
 
-  std::istream& operator>> (std::istream& in, ClusterImpl& blobImpl);
-  std::istream& operator>> (std::istream& in, Cluster& blob);
   std::ostream& operator<< (std::ostream& out, const ClusterImpl& blobImpl);
   std::ostream& operator<< (std::ostream& out, const Cluster& blob);
 
diff --git a/zimlib/src/cluster.cpp b/zimlib/src/cluster.cpp
index 3b24fee..9dbefdc 100644
--- a/zimlib/src/cluster.cpp
+++ b/zimlib/src/cluster.cpp
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -60,34 +61,35 @@
   }
 
   ClusterImpl::ClusterImpl()
-: compression(zimcompNone)
+: compression(zimcompNone),
+  startOffset(0),
+  lazy_read_stream(NULL)
   {
 offsets.push_back(0);
   }
 
-  void ClusterImpl::read(std::istream& in)
+  /* This return the number of char read */
+  offset_type ClusterImpl::read_header(std::istream& in)
   {
-log_debug1("read");
-
+log_debug1("read_header");
 // read first offset, which specifies, how