Petr Onderka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/83994


Change subject: Group compression of diff dumps
......................................................................

Group compression of diff dumps

Change-Id: I2d357218056f9324afa21d586c4b40d935b84830
---
M CMakeLists.txt
M Diff/ChangeProcessor.cpp
M Diff/ChangeProcessor.h
M Diff/Changes/Change.h
M Diff/Changes/NewRevisionChange.cpp
M Diff/Changes/NewRevisionChange.h
M Diff/Changes/RevisionChange.cpp
M Diff/Changes/RevisionChange.h
M Diff/DiffReader.cpp
M Diff/DiffWriter.cpp
M Diff/DiffWriter.h
M Dump.cpp
M Dump.h
M DumpObjects/DumpRevision.cpp
M DumpObjects/DumpRevision.h
M DumpObjects/TextGroup.cpp
M DumpObjects/TextGroup.h
M DumpWriters/CompositeWriter.cpp
M DumpWriters/CompositeWriter.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
M DumpWriters/IDumpWriter.h
M DumpWriters/WriterWrapper.cpp
M DumpWriters/WriterWrapper.h
M Incremental dumps.vcxproj
M TODO.txt
M TextGroupsManager.cpp
M TextGroupsManager.h
M main.cpp
29 files changed, 186 insertions(+), 115 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/94/83994/1

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 578ffdf..0ebbb37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@
   Diff/Changes/FullDeletePageChange.cpp
   Diff/Changes/PartialDeletePageChange.cpp
   Diff/Changes/DeleteRevisionChange.cpp
+  Diff/Changes/DiffTextGroup.cpp
   Diff/Changes/NewModelFormatChange.cpp
   Diff/Changes/NewPageChange.cpp
   Diff/Changes/NewRevisionChange.cpp
@@ -75,6 +76,7 @@
   Diff/Changes/FullDeletePageChange.h
   Diff/Changes/PartialDeletePageChange.h
   Diff/Changes/DeleteRevisionChange.h
+  Diff/Changes/DiffTextGroup.h
   Diff/Changes/NewModelFormatChange.h
   Diff/Changes/NewPageChange.h
   Diff/Changes/NewRevisionChange.h
diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp
index 9640cd0..b3f2d7c 100644
--- a/Diff/ChangeProcessor.cpp
+++ b/Diff/ChangeProcessor.cpp
@@ -1,5 +1,6 @@
 #include "ChangeProcessor.h"
 #include "../Dump.h"
+#include "../TextGroupsManager.h"
 #include "../DumpObjects/DumpRevision.h"
 #include "../Indexes/Index.h"
 
@@ -14,7 +15,7 @@
 }
 
 ChangeProcessor::ChangeProcessor(std::shared_ptr<WritableDump> dump)
-    : dump(dump)
+    : dump(dump), currentTextGroupId(0)
 {}
 
 void ChangeProcessor::Process(SiteInfoChange change)
@@ -67,6 +68,7 @@
     DumpRevision dumpRevision(dump, change.revision.RevisionId);
     dumpRevision.revision = change.revision;
     dumpRevision.SetModelFormatId(change.modelFormatId);
+    dumpRevision.SetTextGroup(currentTextGroupId, change.textId);
     dumpRevision.Write();
 
     currentPage->page.RevisionIds.insert(change.revision.RevisionId);
@@ -99,7 +101,7 @@
         revision.Sha1 = revisionChanges.Sha1;
 
         if (IsPages(dump->fileHeader.Kind))
-            revision.SetText(revisionChanges.GetText());
+            dumpRevision.SetTextGroup(currentTextGroupId, change.textId);
         else
             revision.TextLength = revisionChanges.TextLength;
     }
@@ -132,9 +134,14 @@
     dump->DeletePagePartial(change.pageId);
 }
 
-void ChangeProcessor::End()
+void ChangeProcessor::Process(DiffTextGroup change)
+{
+    currentTextGroupId = dump->textGroupsManager->ImportTextGroup(change.compressedTexts);
+}
+
+void ChangeProcessor::Complete()
 {
     WritePage();
 
-    dump->Complete();
+    dump->Complete(nullptr);
 }
\ No newline at end of file
diff --git a/Diff/ChangeProcessor.h b/Diff/ChangeProcessor.h
index 0d6104a..3691415 100644
--- a/Diff/ChangeProcessor.h
+++ b/Diff/ChangeProcessor.h
@@ -19,6 +19,7 @@
     std::shared_ptr<WritableDump> dump;
 
     std::unique_ptr<DumpPage> currentPage;
+    std::uint32_t currentTextGroupId;
 
     void WritePage();
 public:
@@ -33,7 +34,8 @@
     void Process(DeleteRevisionChange change);
     void Process(FullDeletePageChange change);
     void Process(PartialDeletePageChange change);
+    void Process(DiffTextGroup change);
 
     // has to be called after processing is complete
-    void End();
+    void Complete();
 };
diff --git a/Diff/Changes/Change.h b/Diff/Changes/Change.h
index f384efa..b7bd2c2 100644
--- a/Diff/Changes/Change.h
+++ b/Diff/Changes/Change.h
@@ -15,7 +15,9 @@
     ChangeRevision    = 0x21,
     DeleteRevision    = 0x22,
 
-    NewModelFormat    = 0x30
+    NewModelFormat    = 0x30,
+
+    TextGroup         = 0x40
 };
 
 class Change : public DumpObjectBase
diff --git a/Diff/Changes/NewRevisionChange.cpp b/Diff/Changes/NewRevisionChange.cpp
index ccc9758..8205edc 100644
--- a/Diff/Changes/NewRevisionChange.cpp
+++ b/Diff/Changes/NewRevisionChange.cpp
@@ -1,28 +1,18 @@
 #include "NewRevisionChange.h"
 #include "../../DumpObjects/DumpRevision.h"
-#include "../../SevenZip.h"
-
-void NewRevisionChange::EnsureCompressed()
-{
-    if (compressedTextSet)
-        return;
-
-    compressedText = SevenZip::Compress(revision.GetText());
-    compressedTextSet = true;
-}
 
 NewRevisionChange NewRevisionChange::Read(std::istream &stream, bool withText)
 {
     std::uint8_t modelFormatId;
     auto revision = DumpRevision::ReadCore(stream, modelFormatId, withText);
+    std::uint8_t textId = 0;
 
     if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
     {
-        std::string compressedText = DumpTraits<string>::ReadLong(stream);
-        revision.SetText(SevenZip::Decompress(compressedText));
+        ReadValue(stream, textId);
     }
 
-    return NewRevisionChange(revision, modelFormatId, withText);
+    return NewRevisionChange(revision, modelFormatId, withText, textId);
 }
 
 void NewRevisionChange::WriteInternal()
@@ -33,15 +23,12 @@
 
     if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
     {
-        EnsureCompressed();
-        DumpTraits<std::string>::WriteLong(*stream, compressedText);
+        WriteValue(textId);
     }
 }
 
 std::uint32_t NewRevisionChange::NewLength()
 {
-    EnsureCompressed();
-
     std::uint32_t result = 0;
 
     result += ValueSize(ChangeKind::NewRevision);
@@ -49,8 +36,7 @@
 
     if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
     {
-        EnsureCompressed();
-        result += DumpTraits<std::string>::DumpSizeLong(compressedText);
+        result += ValueSize(textId);
     }
 
     return result;
diff --git a/Diff/Changes/NewRevisionChange.h b/Diff/Changes/NewRevisionChange.h
index 732d8f8..169e204 100644
--- a/Diff/Changes/NewRevisionChange.h
+++ b/Diff/Changes/NewRevisionChange.h
@@ -7,17 +7,14 @@
 {
 private:
     bool withText;
-
-    std::string compressedText;
-    bool compressedTextSet;
-
-    void EnsureCompressed();
 public:
     Revision revision;
     std::uint8_t modelFormatId;
 
-    NewRevisionChange(const Revision &revision, std::uint8_t modelFormatId, bool withText)
-        : revision(revision), modelFormatId(modelFormatId), withText(withText), compressedTextSet(false)
+    std::uint8_t textId;
+
+    NewRevisionChange(const Revision &revision, std::uint8_t modelFormatId, bool withText, std::uint8_t textId)
+        : revision(revision), modelFormatId(modelFormatId), withText(withText), textId(textId)
     {}
 
     static NewRevisionChange Read(std::istream &stream, bool withText);
diff --git a/Diff/Changes/RevisionChange.cpp b/Diff/Changes/RevisionChange.cpp
index f72159a..8c4ab73 100644
--- a/Diff/Changes/RevisionChange.cpp
+++ b/Diff/Changes/RevisionChange.cpp
@@ -1,6 +1,5 @@
 #include "RevisionChange.h"
 #include "../../DumpObjects/DumpUser.h"
-#include "../../SevenZip.h"
 
 RevisionChangeFlags operator &(RevisionChangeFlags first, RevisionChangeFlags second)
 {
@@ -30,23 +29,14 @@
 }
 
 
-void RevisionChange::EnsureCompressed()
-{
-    if (compressedTextSet)
-        return;
-
-    compressedText = SevenZip::Compress(revisionChanges.GetText());
-    compressedTextSet = true;
-}
-
 RevisionChange::RevisionChange(bool withText)
     : withText(withText)
 {}
 
 RevisionChange::RevisionChange(
     const Revision &oldRevision, Revision &newRevision,
-    std::uint8_t newRevisionModelFormatId, bool withText)
-    : withText(withText)
+    std::uint8_t newRevisionModelFormatId, bool withText, std::uint8_t textId)
+    : withText(withText), textId(textId)
 {
     revisionChanges = newRevision;
     this->newRevisionModelFormatId = newRevisionModelFormatId;
@@ -116,10 +106,7 @@
         ReadValue(stream, result.revisionChanges.Sha1);
 
         if (withText)
-        {
-            auto compressedText = DumpTraits<std::string>::ReadLong(stream);
-            result.revisionChanges.SetText(SevenZip::Decompress(compressedText));
-        }
+            ReadValue(stream, result.textId);
         else
             ReadValue(stream, result.revisionChanges.TextLength);
     }
@@ -156,10 +143,7 @@
         WriteValue(revisionChanges.Sha1);
 
         if (withText)
-        {
-            EnsureCompressed();
-            DumpTraits<std::string>::WriteLong(*stream, compressedText);
-        }
+            WriteValue(textId);
         else
             WriteValue(revisionChanges.TextLength);
     }
@@ -196,10 +180,7 @@
         result += ValueSize(revisionChanges.Sha1);
 
         if (withText)
-        {
-            EnsureCompressed();
-            result += DumpTraits<std::string>::DumpSizeLong(compressedText);
-        }
+            result += ValueSize(textId);
         else
             result += ValueSize(revisionChanges.TextLength);
     }
diff --git a/Diff/Changes/RevisionChange.h b/Diff/Changes/RevisionChange.h
index b989581..73b2662 100644
--- a/Diff/Changes/RevisionChange.h
+++ b/Diff/Changes/RevisionChange.h
@@ -26,21 +26,18 @@
 private:
     bool withText;
 
-    std::string compressedText;
-    bool compressedTextSet;
-
     RevisionChange(bool withText);
-
-    void EnsureCompressed();
 public:
     RevisionChangeFlags flags;
 
     Revision revisionChanges;
     std::uint8_t newRevisionModelFormatId;
 
+    std::uint8_t textId;
+
     RevisionChange(
         const Revision &oldRevision, Revision &newRevision,
-        std::uint8_t newRevisionModelFormatId, bool withText);
+        std::uint8_t newRevisionModelFormatId, bool withText, std::uint8_t textId);
 
     static RevisionChange Read(std::istream &stream, bool withText);
     virtual void WriteInternal() override;
diff --git a/Diff/DiffReader.cpp b/Diff/DiffReader.cpp
index abb11c3..3fd291c 100644
--- a/Diff/DiffReader.cpp
+++ b/Diff/DiffReader.cpp
@@ -69,10 +69,13 @@
         case ChangeKind::DeletePagePartial:
             changeProcessor.Process(PartialDeletePageChange::Read(*stream));
             break;
+        case ChangeKind::TextGroup:
+            changeProcessor.Process(DiffTextGroup::Read(*stream));
+            break;
         default:
             throw DumpException();
         }
     }
 
-    changeProcessor.End();
+    changeProcessor.Complete();
 }
\ No newline at end of file
diff --git a/Diff/DiffWriter.cpp b/Diff/DiffWriter.cpp
index e192d46..4ba271d 100644
--- a/Diff/DiffWriter.cpp
+++ b/Diff/DiffWriter.cpp
@@ -24,6 +24,24 @@
     }
 }
 
+template <typename TChange>
+void DiffWriter::Process(TChange &change)
+{
+    if (IsPages(dumpKind))
+        changeQueue.push(std::unique_ptr<Change>(new TChange(change)));
+    else
+        change.Write(stream.get());
+}
+
+void DiffWriter::HandleQueue()
+{
+    while (!changeQueue.empty())
+    {
+        changeQueue.front()->Write(stream.get());
+        changeQueue.pop();
+    }
+}
+
 DiffWriter::DiffWriter(const std::string &fileName, const std::string &name, const std::string &oldTimestamp, const std::string &newTimestamp)
     : stream(std::unique_ptr<std::ostream>(new std::ofstream(fileName, std::ios::binary))),
         dumpStarted(false), pageStarted(false), pageWritten(false),
@@ -56,7 +74,7 @@
         throw DumpException();
 
     NewPageChange change(page);
-    change.Write(stream.get());
+    Process(change);
 
     pageWritten = true;
     pageStarted = true;
@@ -73,7 +91,7 @@
 
     if (change.HasChanges())
     {
-        change.Write(stream.get());
+        Process(change);
         pageWritten = true;
     }
     else
@@ -106,10 +124,10 @@
         throw DumpException();
 
     NewModelFormatChange change(id, model, format);
-    change.Write(stream.get());
+    Process(change);
 }
 
-void DiffWriter::NewRevision(const Revision &revision, std::uint8_t modelFormatId)
+void DiffWriter::NewRevision(const Revision &revision, std::uint8_t modelFormatId, std::uint8_t textId)
 {
     if (!dumpStarted)
         throw DumpException();
@@ -118,12 +136,12 @@
 
     EnsurePageWritten();
 
-    NewRevisionChange change(revision, modelFormatId, IsPages(dumpKind));
-    change.Write(stream.get());
+    NewRevisionChange change(revision, modelFormatId, IsPages(dumpKind), textId);
+    Process(change);
 }
 
 void DiffWriter::ChangeRevision(
-    const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId)
+    const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId , std::uint8_t textId)
 {
     if (!dumpStarted)
         throw DumpException();
@@ -132,8 +150,8 @@
 
     EnsurePageWritten();
 
-    RevisionChange change(oldRevision, newRevision, newModelFormatId, IsPages(dumpKind));
-    change.Write(stream.get());
+    RevisionChange change(oldRevision, newRevision, newModelFormatId, IsPages(dumpKind), textId);
+    Process(change);
 }
 
 void DiffWriter::DeleteRevision(std::uint32_t revisionId)
@@ -145,7 +163,7 @@
         EnsurePageWritten();
 
     DeleteRevisionChange change(revisionId);
-    change.Write(stream.get());
+    Process(change);
 }
 
 void DiffWriter::EndPage()
@@ -169,7 +187,7 @@
         throw DumpException();
 
     FullDeletePageChange change(pageId);
-    change.Write(stream.get());
+    Process(change);
 }
 
 void DiffWriter::DeletePagePartial(std::uint32_t pageId)
@@ -180,5 +198,21 @@
         throw DumpException();
 
     PartialDeletePageChange change(pageId);
-    change.Write(stream.get());
+    Process(change);
+}
+
+void DiffWriter::SetTextGroup(const std::string& compressedTexts)
+{
+    if (!dumpStarted)
+        throw DumpException();
+
+    DiffTextGroup diffTextGroup(compressedTexts);
+    diffTextGroup.Write(stream.get());
+
+    HandleQueue();
+}
+
+void DiffWriter::Complete()
+{
+    HandleQueue();
 }
\ No newline at end of file
diff --git a/Diff/DiffWriter.h b/Diff/DiffWriter.h
index 76546ca..d789554 100644
--- a/Diff/DiffWriter.h
+++ b/Diff/DiffWriter.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <string>
+#include <queue>
 #include "Changes/PageChange.h"
 #include "../Objects/Page.h"
 #include "../Objects/Revision.h"
@@ -22,7 +23,15 @@
     const std::string oldTimestamp;
     const std::string newTimestamp;
 
+    std::queue<std::unique_ptr<Change>> changeQueue;
+
     void EnsurePageWritten();
+
+    // either saves the change directly, or adds it to queue
+    template <typename TChange>
+    void Process(TChange &change);
+
+    void HandleQueue();
 public:
     DiffWriter(const std::string &fileName, const std::string &name, const std::string &oldTimestamp, const std::string &newTimestamp);
 
@@ -35,12 +44,16 @@
     
     void NewModelFormat(std::uint8_t id, const std::string &model, const std::string &format);
 
-    void NewRevision(const Revision &revision, std::uint8_t modelFormatId);
-    void ChangeRevision(const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId);
+    void NewRevision(const Revision &revision, std::uint8_t modelFormatId, std::uint8_t textId);
+    void ChangeRevision(const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId, std::uint8_t textId);
     void DeleteRevision(std::uint32_t revisionId);
 
     void EndPage();
 
     void DeletePageFull(std::uint32_t pageId);
     void DeletePagePartial(std::uint32_t pageId);
+
+    void SetTextGroup(const std::string& compressedTexts);
+
+    void Complete();
 };
\ No newline at end of file
diff --git a/Dump.cpp b/Dump.cpp
index e4cad1c..46d2d31 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -90,9 +90,9 @@
     return dump;
 }
 
-void WritableDump::Complete()
+void WritableDump::Complete(DiffWriter* diffWriter)
 {
-    textGroupsManager->Complete();
+    textGroupsManager->Complete(diffWriter);
 
     spaceIndex->Write();
     pageIdIndex->Write();
diff --git a/Dump.h b/Dump.h
index 4d80a7b..e454671 100644
--- a/Dump.h
+++ b/Dump.h
@@ -8,6 +8,7 @@
 #include "DumpObjects/FileHeader.h"
 #include "DumpObjects/DumpSiteInfo.h"
 #include "DumpObjects/TextGroup.h"
+#include "Diff/DiffWriter.h"
 
 using std::int32_t;
 using std::int64_t;
@@ -70,7 +71,7 @@
     std::unique_ptr<TextGroupsManager> textGroupsManager;
 
     // it's necessary to call this after writing is finished
-    void Complete();
+    void Complete(DiffWriter* diffWriter);
 
     void DeletePagePartial(std::uint32_t pageId);
     // also deletes revisions of the given page
diff --git a/DumpObjects/DumpRevision.cpp b/DumpObjects/DumpRevision.cpp
index aedc93c..16c010a 100644
--- a/DumpObjects/DumpRevision.cpp
+++ b/DumpObjects/DumpRevision.cpp
@@ -70,7 +70,7 @@
     if (wasLoaded && originalRevision == revision)
     {
         if (forceDiff && diffWriter != nullptr)
-            diffWriter->ChangeRevision(originalRevision, revision, modelFormatId);
+            diffWriter->ChangeRevision(originalRevision, revision, modelFormatId, textId);
 
         return;
     }
@@ -94,7 +94,7 @@
 
     if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
     {
-        if (textGroupId == 0 || originalRevision.Sha1 != revision.Sha1)
+        if (textGroupId == 0 || (!textSaved && originalRevision.Sha1 != revision.Sha1))
         {
             if (textGroupId != 0)
                 DeleteText();
@@ -111,9 +111,9 @@
     if (diffWriter != nullptr)
     {
         if (wasLoaded)
-            diffWriter->ChangeRevision(originalRevision, revision, modelFormatId);
+            diffWriter->ChangeRevision(originalRevision, revision, modelFormatId, textId);
         else
-            diffWriter->NewRevision(revision, modelFormatId);
+            diffWriter->NewRevision(revision, modelFormatId, textId);
     }
 }
 
@@ -126,7 +126,7 @@
     else
         dumpRef->revisionIdIndex->Add(revision.RevisionId, offset);
 
-    dumpRef->textGroupsManager->WriteTextGroupIfFull();
+    dumpRef->textGroupsManager->WriteTextGroupIfFull(diffWriter);
 }
 
 std::uint32_t DumpRevision::NewLength()
@@ -143,7 +143,7 @@
 }
 
 DumpRevision::DumpRevision(std::weak_ptr<WritableDump> dump, std::uint32_t revisionId)
-    : DumpObject(dump), revision(), modelFormatId(), textGroupId(0), textId(0),
+    : DumpObject(dump), revision(), modelFormatId(), textGroupId(0), textId(0), textSaved(false),
         isModelFormatIdNew(false), wasLoaded(true), textUnloaded(false), diffWriter(), forceDiff(false)
 {
     withText = IsPages(dump.lock()->fileHeader.Kind);
@@ -368,6 +368,13 @@
     return result;
 }
 
+void DumpRevision::SetTextGroup(std::uint32_t textGroupId, std::uint8_t textId)
+{
+    this->textGroupId = textGroupId;
+    this->textId = textId;
+    this->textSaved = true;
+}
+
 void DumpRevision::DeleteText()
 {
     if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
diff --git a/DumpObjects/DumpRevision.h b/DumpObjects/DumpRevision.h
index 47e3009..833fed1 100644
--- a/DumpObjects/DumpRevision.h
+++ b/DumpObjects/DumpRevision.h
@@ -17,6 +17,8 @@
 
     std::uint32_t textGroupId;
     std::uint8_t textId;
+    // is set when text group is set from outside
+    bool textSaved;
 
     std::uint32_t textLength;
     std::uint64_t textOffset;
@@ -48,6 +50,8 @@
     static void WriteCore(std::ostream &stream, Revision &revision, std::uint8_t modelFormatId, bool withText);
     static std::uint32_t LengthCore(const Revision &revision, std::uint8_t modelFormatId, bool withText);
 
+    void SetTextGroup(std::uint32_t textGroupId, std::uint8_t textId);
+
     // deletes text of the current revision from the dump, if there is any
     void DeleteText();
 };
\ No newline at end of file
diff --git a/DumpObjects/TextGroup.cpp b/DumpObjects/TextGroup.cpp
index 7314741..120546c 100644
--- a/DumpObjects/TextGroup.cpp
+++ b/DumpObjects/TextGroup.cpp
@@ -8,9 +8,6 @@
 
 void TextGroup::EnsureCompressed()
 {
-    if (texts.empty())
-        throw DumpException();
-
     if (!compressedTexts.empty())
         return;
 
@@ -151,6 +148,14 @@
     return result;
 }
 
+void TextGroup::SetCompressedTexts(const std::string& newCompressedTexts)
+{
+    if (!compressedTexts.empty() || !texts.empty())
+        throw DumpException();
+
+    compressedTexts = newCompressedTexts;
+}
+
 bool TextGroup::IsEmpty() const
 {
     return std::all_of(texts.begin(), texts.end(), [](const std::string &s) { return s == deletedTextMark; });
diff --git a/DumpObjects/TextGroup.h b/DumpObjects/TextGroup.h
index 7184db8..0ea50e3 100644
--- a/DumpObjects/TextGroup.h
+++ b/DumpObjects/TextGroup.h
@@ -7,7 +7,6 @@
 private:
     std::uint32_t textGroupId;
     std::vector<std::string> texts;
-    std::string compressedTexts;
     // if a group is not editable, new texts cannot be added, but old texts can still be deleted
     bool isEditable;
 
@@ -20,6 +19,8 @@
     virtual void WriteInternal() override;
     virtual void UpdateIndex(Offset offset, bool overwrite) override;
 public:
+    std::string compressedTexts;
+
     TextGroup(std::weak_ptr<WritableDump> dump, std::uint32_t textGroupId);
 
     virtual uint32_t NewLength() override;
@@ -31,5 +32,7 @@
     bool DeleteText(std::uint8_t textId);
     std::string GetText(std::uint8_t textId) const;
 
+    void SetCompressedTexts(const std::string& compressedTexts);
+
     bool IsFull() const;
 };
\ No newline at end of file
diff --git a/DumpWriters/CompositeWriter.cpp b/DumpWriters/CompositeWriter.cpp
index ad75049..4e53dd0 100644
--- a/DumpWriters/CompositeWriter.cpp
+++ b/DumpWriters/CompositeWriter.cpp
@@ -34,8 +34,8 @@
         writer->SetDumpKind(dumpKind);
 }
 
-void CompositeWriter::EndDump()
+void CompositeWriter::Complete()
 {
     for (auto &writer : writers)
-        writer->EndDump();
+        writer->Complete();
 }
\ No newline at end of file
diff --git a/DumpWriters/CompositeWriter.h b/DumpWriters/CompositeWriter.h
index 8966f52..2967d3f 100644
--- a/DumpWriters/CompositeWriter.h
+++ b/DumpWriters/CompositeWriter.h
@@ -20,5 +20,5 @@
     virtual void EndPage() override;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
     virtual void SetDumpKind(DumpKind dumpKind) override;
-    virtual void EndDump() override;
+    virtual void Complete() override;
 };
\ No newline at end of file
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 55af7df..1154b20 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -137,7 +137,7 @@
     }
 }
 
-void DumpWriter::EndDump()
+void DumpWriter::Complete()
 {
     for (std::uint32_t i = 0; i < unvisitedPageIds.size(); i++)
     {
@@ -162,5 +162,8 @@
         }
     }
 
-    dump->Complete();
+    dump->Complete(diffWriter.get());
+
+    if (diffWriter != nullptr)
+        diffWriter->Complete();
 }
\ No newline at end of file
diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h
index 2d99090..c70cbc9 100644
--- a/DumpWriters/DumpWriter.h
+++ b/DumpWriters/DumpWriter.h
@@ -29,5 +29,5 @@
     virtual void EndPage() override;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
     virtual void SetDumpKind(DumpKind dumpKind) override;
-    virtual void EndDump() override;
+    virtual void Complete() override;
 };
\ No newline at end of file
diff --git a/DumpWriters/IDumpWriter.h b/DumpWriters/IDumpWriter.h
index 42cde90..cc50438 100644
--- a/DumpWriters/IDumpWriter.h
+++ b/DumpWriters/IDumpWriter.h
@@ -14,7 +14,7 @@
     virtual void EndPage() = 0;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) = 0;
     virtual void SetDumpKind(DumpKind dumpKind) = 0;
-    virtual void EndDump() = 0;
+    virtual void Complete() = 0;
 
     virtual ~IDumpWriter() {}
 };
diff --git a/DumpWriters/WriterWrapper.cpp b/DumpWriters/WriterWrapper.cpp
index 697f89f..886b255 100644
--- a/DumpWriters/WriterWrapper.cpp
+++ b/DumpWriters/WriterWrapper.cpp
@@ -20,7 +20,7 @@
     wrapped->SetSiteInfo(siteInfo);
 }
 
-void WriterWrapper::EndDump()
+void WriterWrapper::Complete()
 {
-    wrapped->EndDump();
+    wrapped->Complete();
 }
\ No newline at end of file
diff --git a/DumpWriters/WriterWrapper.h b/DumpWriters/WriterWrapper.h
index 7718133..d520177 100644
--- a/DumpWriters/WriterWrapper.h
+++ b/DumpWriters/WriterWrapper.h
@@ -16,5 +16,5 @@
     virtual void EndPage() override;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
     virtual void SetDumpKind(DumpKind dumpKind) override = 0;
-    virtual void EndDump() override;
+    virtual void Complete() override;
 };
\ No newline at end of file
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index cce07ea..81882bb 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -84,6 +84,7 @@
     <ClCompile Include="Diff\ChangeProcessor.cpp" />
     <ClCompile Include="Diff\Changes\Change.cpp" />
     <ClCompile Include="Diff\Changes\DeleteRevisionChange.cpp" />
+    <ClCompile Include="Diff\Changes\DiffTextGroup.cpp" />
     <ClCompile Include="Diff\Changes\FullDeletePageChange.cpp" />
     <ClCompile Include="Diff\Changes\PartialDeletePageChange.cpp" />
     <ClCompile Include="Diff\DiffReader.cpp" />
@@ -107,6 +108,7 @@
     <ClInclude Include="Diff\ChangeProcessor.h" />
     <ClInclude Include="Diff\Changes\Change.h" />
     <ClInclude Include="Diff\Changes\DeleteRevisionChange.h" />
+    <ClInclude Include="Diff\Changes\DiffTextGroup.h" />
     <ClInclude Include="Diff\Changes\FullDeletePageChange.h" />
     <ClInclude Include="Diff\Changes\PartialDeletePageChange.h" />
     <ClInclude Include="Diff\DiffReader.h" />
diff --git a/TODO.txt b/TODO.txt
index 81942cd..151e49f 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,13 +1,13 @@
 short term:
 - don't write empty unsaved indexes
 - compression of metadata and indexes
-- better (group) compression of diff dumps
- - one group for id and dd (no compressing twice)
+- better error messages (including bad filename)
 - idumps r to stdout
 - std::
 - encapsulate public fields? (e.g. in WritableDump)
 - deal with Readable/WritableDump
 - extract base36 code from DumpRevision
+- save dumpkind
 - do something with build process
 
 long term:
diff --git a/TextGroupsManager.cpp b/TextGroupsManager.cpp
index dba23ca..900159f 100644
--- a/TextGroupsManager.cpp
+++ b/TextGroupsManager.cpp
@@ -6,6 +6,12 @@
     : dump(dump), textGroupModified(false)
 { }
 
+std::unique_ptr<TextGroup> TextGroupsManager::CreateNewGroup()
+{
+    std::uint32_t newId = getNewId(*dump.lock()->textGroupIdIndex);
+    return std::unique_ptr<TextGroup>(new TextGroup(dump, newId));
+}
+
 std::pair<std::uint32_t, std::uint8_t> TextGroupsManager::AddTextToGroup(const std::string& text)
 {
     auto dumpRef = dump.lock();
@@ -14,8 +20,7 @@
     {
         auto streamPos = dumpRef->stream->tellg();
 
-        std::uint32_t newId = getNewId(*dumpRef->textGroupIdIndex);
-        currentGroup = std::unique_ptr<TextGroup>(new TextGroup(dump, newId));
+        currentGroup = CreateNewGroup();
         textGroupModified = false;
 
         // this method can be written in the middle of writing an object,
@@ -30,19 +35,22 @@
     return std::make_pair(textGroupId, textId);
 }
 
-void TextGroupsManager::EndGroup()
+void TextGroupsManager::EndGroup(DiffWriter* diffWriter)
 {
     if (textGroupModified)
         currentGroup->Write();
+
+    if (diffWriter != nullptr)
+        diffWriter->SetTextGroup(currentGroup->compressedTexts);
 
     currentGroup = nullptr;
     textGroupModified = false;
 }
 
-void TextGroupsManager::WriteTextGroupIfFull()
+void TextGroupsManager::WriteTextGroupIfFull(DiffWriter* diffWriter)
 {
     if (currentGroup != nullptr && currentGroup->IsFull())
-        EndGroup();
+        EndGroup(diffWriter);
 }
 
 void TextGroupsManager::DeleteTextFromGroup(std::uint32_t textGroupId, std::uint8_t textId)
@@ -73,7 +81,7 @@
 {
     if (currentGroup == nullptr || currentGroup->GetTextGroupId() != textGroupId)
     {
-        EndGroup();
+        EndGroup(nullptr);
 
         currentGroup = std::unique_ptr<TextGroup>(new TextGroup(dump, textGroupId));
         textGroupModified = false;
@@ -82,8 +90,17 @@
     return currentGroup->GetText(textId);
 }
 
-void TextGroupsManager::Complete()
+std::uint32_t TextGroupsManager::ImportTextGroup(const std::string& compressedTexts)
 {
-    EndGroup();
+    auto group = CreateNewGroup();
+    group->SetCompressedTexts(compressedTexts);
+    group->Write();
+
+    return group->GetTextGroupId();
+}
+
+void TextGroupsManager::Complete(DiffWriter* diffWriter)
+{
+    EndGroup(diffWriter);
     EndDeletedGroup();
 }
\ No newline at end of file
diff --git a/TextGroupsManager.h b/TextGroupsManager.h
index e854482..337f62f 100644
--- a/TextGroupsManager.h
+++ b/TextGroupsManager.h
@@ -5,6 +5,7 @@
 #include <utility>
 #include <memory>
 #include "DumpObjects/TextGroup.h"
+#include "Diff/DiffWriter.h"
 
 class TextGroupsManager
 {
@@ -15,17 +16,21 @@
     std::unique_ptr<TextGroup> currentDeletedGroup;
     bool textGroupModified;
 
+    std::unique_ptr<TextGroup> CreateNewGroup();
+
+    void EndGroup(DiffWriter* diffWriter);
     void EndDeletedGroup();
 public:
     TextGroupsManager(std::weak_ptr<WritableDump> dump);
 
     std::pair<std::uint32_t, std::uint8_t> AddTextToGroup(const std::string& text);
-    void EndGroup();
-    void WriteTextGroupIfFull();
+    void WriteTextGroupIfFull(DiffWriter* diffWriter);
 
     void DeleteTextFromGroup(std::uint32_t textGroupId, std::uint8_t textId);
 
     std::string GetTextFromGroup(std::uint32_t textGroupId, std::uint8_t textId);
 
-    void Complete();
+    std::uint32_t ImportTextGroup(const std::string& compressedTexts); 
+
+    void Complete(DiffWriter* diffWriter);
 };
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 5a20936..28f3080 100644
--- a/main.cpp
+++ b/main.cpp
@@ -163,7 +163,7 @@
 
     XmlMediawikiProcessor::Process(&writer, inputFileName);
 
-    writer.EndDump();
+    writer.Complete();
 }
 
 void updateDump(std::queue<std::string> &parameters)
@@ -210,7 +210,7 @@
 
     XmlMediawikiProcessor::Process(&writer, dumpBackupStream);
 
-    writer.EndDump();
+    writer.Complete();
 }
 
 void readDump(std::string dumpFileName, std::string outputFileName)

-- 
To view, visit https://gerrit.wikimedia.org/r/83994
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2d357218056f9324afa21d586c4b40d935b84830
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to