Kelson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/295383 )

Change subject: Compress blobs and track file size immediately after adding 
each article.
......................................................................


Compress blobs and track file size immediately after adding each article.

Rather than first collecting all the directory entries and only afterwards
writing the blobs, write each cluster on the fly as we see each article.
Keep track of both compressed and uncompressed clusters so that we don't
needlessly terminate compressed clusters just because we happen to have
encountered an uncompressible file.  Account for additions to each of
the various indices as we go so that we maintain a fairly accurate
size for the file at every point, which will allow us to stop adding
articles once the ZIM file gets to a certain size.

Change-Id: Ib644fff4cb804320a07aadbea499c8416df66adc
---
M zimlib/include/zim/writer/zimcreator.h
M zimlib/src/zimcreator.cpp
2 files changed, 126 insertions(+), 86 deletions(-)

Approvals:
  Kelson: Verified; Looks good to me, approved



diff --git a/zimlib/include/zim/writer/zimcreator.h 
b/zimlib/include/zim/writer/zimcreator.h
index 5134d98..4296371 100644
--- a/zimlib/include/zim/writer/zimcreator.h
+++ b/zimlib/include/zim/writer/zimcreator.h
@@ -52,10 +52,10 @@
         CompressionType compression;
         bool isEmpty;
         offset_type clustersSize;
+        offset_type currentSize;
 
-        void createDirents(ArticleSource& src);
+        void createDirents(ArticleSource& src, const std::string& tmpfname);
         void createTitleIndex(ArticleSource& src);
-        void createClusters(ArticleSource& src, const std::string& tmpfname);
         void fillHeader(ArticleSource& src);
         void write(const std::string& fname, const std::string& tmpfname);
 
@@ -84,6 +84,10 @@
         void setMinChunkSize(int s)   { minChunkSize = s; }
 
         void create(const std::string& fname, ArticleSource& src);
+
+        /* The user can query `currentSize` after each article has been
+         * added to the ZIM file. */
+        offset_type getCurrentSize() { return currentSize; }
     };
 
   }
diff --git a/zimlib/src/zimcreator.cpp b/zimlib/src/zimcreator.cpp
index 66ce902..ac2720b 100644
--- a/zimlib/src/zimcreator.cpp
+++ b/zimlib/src/zimcreator.cpp
@@ -56,28 +56,30 @@
       : minChunkSize(1024-64),
         nextMimeIdx(0),
 #ifdef ENABLE_LZMA
-        compression(zimcompLzma)
+        compression(zimcompLzma),
 #elif ENABLE_BZIP2
-        compression(zimcompBzip2)
+        compression(zimcompBzip2),
 #elif ENABLE_ZLIB
-        compression(zimcompZip)
+        compression(zimcompZip),
 #else
-        compression(zimcompNone)
+        compression(zimcompNone),
 #endif
+        currentSize(0)
     {
     }
 
     ZimCreator::ZimCreator(int& argc, char* argv[])
       : nextMimeIdx(0),
 #ifdef ENABLE_LZMA
-        compression(zimcompLzma)
+        compression(zimcompLzma),
 #elif ENABLE_BZIP2
-        compression(zimcompBzip2)
+        compression(zimcompBzip2),
 #elif ENABLE_ZLIB
-        compression(zimcompZip)
+        compression(zimcompZip),
 #else
-        compression(zimcompNone)
+        compression(zimcompNone),
 #endif
+        currentSize(0)
     {
       Arg<unsigned> minChunkSizeArg(argc, argv, "--min-chunk-size");
       if (minChunkSizeArg.isSet())
@@ -110,15 +112,12 @@
       log_debug("basename " << basename);
 
       INFO("create directory entries");
-      createDirents(src);
+      createDirents(src, basename + ".tmp");
       INFO(dirents.size() << " directory entries created");
 
       INFO("create title index");
       createTitleIndex(src);
       INFO(dirents.size() << " title index created");
-
-      INFO("create clusters");
-      createClusters(src, basename + ".tmp");
       INFO(clusterOffsets.size() << " clusters created");
 
       INFO("fill header");
@@ -132,9 +131,23 @@
       INFO("ready");
     }
 
-    void ZimCreator::createDirents(ArticleSource& src)
+    void ZimCreator::createDirents(ArticleSource& src, const std::string& 
tmpfname)
     {
       INFO("collect articles");
+      std::ofstream out(tmpfname.c_str());
+      currentSize =
+        80 /* for header */ +
+        1 /* for mime type table termination */ +
+        16 /* for md5sum */;
+
+      // We keep both a "compressed cluster" and an "uncompressed cluster"
+      // because we don't know which one will fill up first.  We also need
+      // to track the dirents currently in each, so we can fix up the
+      // cluster index if the other one ends up written first.
+      DirentsType compDirents, uncompDirents;
+      Cluster compCluster, uncompCluster;
+      compCluster.setCompression(compression);
+      uncompCluster.setCompression(zimcompNone);
 
       const Article* article;
       while ((article = src.getNextArticle()) != 0)
@@ -163,13 +176,107 @@
         }
         else
         {
+          uint16_t oldMimeIdx = nextMimeIdx;
           dirent.setArticle(getMimeTypeIdx(article->getMimeType()), 0, 0);
           dirent.setCompress(article->shouldCompress());
           log_debug("is article; mimetype " << dirent.getMimeType());
+          if (oldMimeIdx != nextMimeIdx)
+          {
+            // Account for the size of the mime type entry
+            currentSize += rmimeTypes[oldMimeIdx].size() +
+              1 /* trailing null */;
+          }
         }
 
         dirents.push_back(dirent);
+        currentSize +=
+          dirent.getDirentSize() /* for directory entry */ +
+          sizeof(offset_type) /* for url pointer list */ +
+          sizeof(size_type) /* for title pointer list */;
+
+        // If this is a redirect, we're done: there's no blob to add.
+        if (dirent.isRedirect())
+        {
+          continue;
+        }
+
+        // Add blob data to compressed or uncompressed cluster.
+        Blob blob = src.getData(dirent.getAid());
+        if (blob.size() > 0)
+        {
+          isEmpty = false;
+        }
+
+        Cluster *cluster;
+        DirentsType *myDirents, *otherDirents;
+        if (dirent.isCompress())
+        {
+          cluster = &compCluster;
+          myDirents = &compDirents;
+          otherDirents = &uncompDirents;
+        }
+        else
+        {
+          cluster = &uncompCluster;
+          myDirents = &uncompDirents;
+          otherDirents = &compDirents;
+        }
+        myDirents->push_back(dirent);
+        dirent.setCluster(clusterOffsets.size(), cluster->count());
+        cluster->addBlob(blob);
+
+        // If cluster is now large enough, write it to disk.
+        if (cluster->size() >= minChunkSize * 1024)
+        {
+          log_info("cluster with " << cluster->count() << " articles, " <<
+                   cluster->size() << " bytes; current title \"" <<
+                   dirent.getTitle() << '\"');
+          offset_type start = out.tellp();
+          clusterOffsets.push_back(start);
+          out << *cluster;
+          log_debug("cluster written");
+          cluster->clear();
+          myDirents->clear();
+          // Update the cluster number of the dirents *not* written to disk.
+          for (DirentsType::iterator di = otherDirents->begin();
+               di != otherDirents->end(); ++di)
+          {
+            di->setCluster(clusterOffsets.size(), di->getBlobNumber());
+          }
+          offset_type end = out.tellp();
+          currentSize += (end - start) +
+            sizeof(offset_type) /* for cluster pointer entry */;
+        }
       }
+
+      // When we've seen all articles, write any remaining clusters.
+      if (compCluster.count() > 0)
+      {
+        clusterOffsets.push_back(out.tellp());
+        out << compCluster;
+        for (DirentsType::iterator di = uncompDirents.begin();
+             di != uncompDirents.end(); ++di)
+        {
+          di->setCluster(clusterOffsets.size(), di->getBlobNumber());
+        }
+      }
+      compCluster.clear();
+      compDirents.clear();
+
+      if (uncompCluster.count() > 0)
+      {
+        clusterOffsets.push_back(out.tellp());
+        out << uncompCluster;
+      }
+      uncompCluster.clear();
+      uncompDirents.clear();
+
+      if (!out)
+      {
+        throw std::runtime_error("failed to write temporary cluster file");
+      }
+
+      clustersSize = out.tellp();
 
       // sort
       INFO("sort " << dirents.size() << " directory entries (aid)");
@@ -266,77 +373,6 @@
 
       CompareTitle compareTitle(dirents);
       std::sort(titleIdx.begin(), titleIdx.end(), compareTitle);
-    }
-
-    void ZimCreator::createClusters(ArticleSource& src, const std::string& 
tmpfname)
-    {
-      std::ofstream out(tmpfname.c_str());
-
-      Cluster cluster;
-      cluster.setCompression(compression);
-
-      DirentsType::size_type count = 0, progress = 0;
-      for (DirentsType::iterator di = dirents.begin(); out && di != 
dirents.end(); ++di, ++count)
-      {
-        while (progress < count * 100 / dirents.size() + 1)
-        {
-          INFO(progress << "% ready");
-          progress += 10;
-        }
-
-        if (di->isRedirect())
-          continue;
-
-        Blob blob = src.getData(di->getAid());
-        if (blob.size() > 0)
-          isEmpty = false;
-
-        if (di->isCompress())
-        {
-          di->setCluster(clusterOffsets.size(), cluster.count());
-          cluster.addBlob(blob);
-          if (cluster.size() >= minChunkSize * 1024)
-          {
-            log_info("compress cluster with " << cluster.count() << " 
articles, " << cluster.size() << " bytes; current title \"" << di->getTitle() 
<< '\"');
-
-            clusterOffsets.push_back(out.tellp());
-            out << cluster;
-            log_debug("cluster compressed");
-            cluster.clear();
-            cluster.setCompression(compression);
-          }
-        }
-        else
-        {
-          if (cluster.count() > 0)
-          {
-            clusterOffsets.push_back(out.tellp());
-            cluster.setCompression(compression);
-            out << cluster;
-            cluster.clear();
-            cluster.setCompression(compression);
-          }
-
-          di->setCluster(clusterOffsets.size(), cluster.count());
-          clusterOffsets.push_back(out.tellp());
-          Cluster c;
-          c.addBlob(blob);
-          c.setCompression(zimcompNone);
-          out << c;
-        }
-      }
-
-      if (cluster.count() > 0)
-      {
-        clusterOffsets.push_back(out.tellp());
-        cluster.setCompression(compression);
-        out << cluster;
-      }
-
-      if (!out)
-        throw std::runtime_error("failed to write temporary cluster file");
-
-      clustersSize = out.tellp();
     }
 
     void ZimCreator::fillHeader(ArticleSource& src)

-- 
To view, visit https://gerrit.wikimedia.org/r/295383
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib644fff4cb804320a07aadbea499c8416df66adc
Gerrit-PatchSet: 2
Gerrit-Project: openzim
Gerrit-Branch: master
Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org>
Gerrit-Reviewer: Kelson <kel...@kiwix.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to