Petr Onderka has submitted this change and it was merged.
Change subject: Group compression of diff dumps
......................................................................
Group compression of diff dumps
Change-Id: I2d357218056f9324afa21d586c4b40d935b84830
---
M CMakeLists.txt
M Diff/ChangeProcessor.cpp
M Diff/ChangeProcessor.h
M Diff/Changes/Change.h
A Diff/Changes/DiffTextGroup.cpp
A Diff/Changes/DiffTextGroup.h
M Diff/Changes/NewRevisionChange.cpp
M Diff/Changes/NewRevisionChange.h
M Diff/Changes/RevisionChange.cpp
M Diff/Changes/RevisionChange.h
M Diff/DiffReader.cpp
M Diff/DiffWriter.cpp
M Diff/DiffWriter.h
M Dump.cpp
M Dump.h
M DumpObjects/DumpRevision.cpp
M DumpObjects/DumpRevision.h
M DumpObjects/TextGroup.cpp
M DumpObjects/TextGroup.h
M DumpWriters/CompositeWriter.cpp
M DumpWriters/CompositeWriter.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
M DumpWriters/IDumpWriter.h
M DumpWriters/WriterWrapper.cpp
M DumpWriters/WriterWrapper.h
M Incremental dumps.vcxproj
M TODO.txt
M TextGroupsManager.cpp
M TextGroupsManager.h
M main.cpp
31 files changed, 226 insertions(+), 116 deletions(-)
Approvals:
Petr Onderka: Verified; Looks good to me, approved
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 578ffdf..0ebbb37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@
Diff/Changes/FullDeletePageChange.cpp
Diff/Changes/PartialDeletePageChange.cpp
Diff/Changes/DeleteRevisionChange.cpp
+ Diff/Changes/DiffTextGroup.cpp
Diff/Changes/NewModelFormatChange.cpp
Diff/Changes/NewPageChange.cpp
Diff/Changes/NewRevisionChange.cpp
@@ -75,6 +76,7 @@
Diff/Changes/FullDeletePageChange.h
Diff/Changes/PartialDeletePageChange.h
Diff/Changes/DeleteRevisionChange.h
+ Diff/Changes/DiffTextGroup.h
Diff/Changes/NewModelFormatChange.h
Diff/Changes/NewPageChange.h
Diff/Changes/NewRevisionChange.h
diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp
index 9640cd0..b3f2d7c 100644
--- a/Diff/ChangeProcessor.cpp
+++ b/Diff/ChangeProcessor.cpp
@@ -1,5 +1,6 @@
#include "ChangeProcessor.h"
#include "../Dump.h"
+#include "../TextGroupsManager.h"
#include "../DumpObjects/DumpRevision.h"
#include "../Indexes/Index.h"
@@ -14,7 +15,7 @@
}
ChangeProcessor::ChangeProcessor(std::shared_ptr<WritableDump> dump)
- : dump(dump)
+ : dump(dump), currentTextGroupId(0)
{}
void ChangeProcessor::Process(SiteInfoChange change)
@@ -67,6 +68,7 @@
DumpRevision dumpRevision(dump, change.revision.RevisionId);
dumpRevision.revision = change.revision;
dumpRevision.SetModelFormatId(change.modelFormatId);
+ dumpRevision.SetTextGroup(currentTextGroupId, change.textId);
dumpRevision.Write();
currentPage->page.RevisionIds.insert(change.revision.RevisionId);
@@ -99,7 +101,7 @@
revision.Sha1 = revisionChanges.Sha1;
if (IsPages(dump->fileHeader.Kind))
- revision.SetText(revisionChanges.GetText());
+ dumpRevision.SetTextGroup(currentTextGroupId, change.textId);
else
revision.TextLength = revisionChanges.TextLength;
}
@@ -132,9 +134,14 @@
dump->DeletePagePartial(change.pageId);
}
-void ChangeProcessor::End()
+void ChangeProcessor::Process(DiffTextGroup change)
+{
+ currentTextGroupId = dump->textGroupsManager->ImportTextGroup(change.compressedTexts);
+}
+
+void ChangeProcessor::Complete()
{
WritePage();
- dump->Complete();
+ dump->Complete(nullptr);
}
\ No newline at end of file
diff --git a/Diff/ChangeProcessor.h b/Diff/ChangeProcessor.h
index 0d6104a..3691415 100644
--- a/Diff/ChangeProcessor.h
+++ b/Diff/ChangeProcessor.h
@@ -19,6 +19,7 @@
std::shared_ptr<WritableDump> dump;
std::unique_ptr<DumpPage> currentPage;
+ std::uint32_t currentTextGroupId;
void WritePage();
public:
@@ -33,7 +34,8 @@
void Process(DeleteRevisionChange change);
void Process(FullDeletePageChange change);
void Process(PartialDeletePageChange change);
+ void Process(DiffTextGroup change);
// has to be called after processing is complete
- void End();
+ void Complete();
};
diff --git a/Diff/Changes/Change.h b/Diff/Changes/Change.h
index f384efa..b7bd2c2 100644
--- a/Diff/Changes/Change.h
+++ b/Diff/Changes/Change.h
@@ -15,7 +15,9 @@
ChangeRevision = 0x21,
DeleteRevision = 0x22,
- NewModelFormat = 0x30
+ NewModelFormat = 0x30,
+
+ TextGroup = 0x40
};
class Change : public DumpObjectBase
diff --git a/Diff/Changes/DiffTextGroup.cpp b/Diff/Changes/DiffTextGroup.cpp
new file mode 100644
index 0000000..b70d0cb
--- /dev/null
+++ b/Diff/Changes/DiffTextGroup.cpp
@@ -0,0 +1,24 @@
+#include "DiffTextGroup.h"
+
+DiffTextGroup::DiffTextGroup(const std::string &compressedTexts)
+ : compressedTexts(compressedTexts)
+{}
+
+DiffTextGroup DiffTextGroup::Read(std::istream &stream)
+{
+ auto texts = DumpTraits<std::string>::ReadLong(stream);
+
+ return DiffTextGroup(texts);
+}
+
+std::uint32_t DiffTextGroup::NewLength()
+{
+ return ValueSize(ChangeKind::TextGroup)
+ + DumpTraits<std::string>::DumpSizeLong(compressedTexts);
+}
+
+void DiffTextGroup::WriteInternal()
+{
+ WriteValue(ChangeKind::TextGroup);
+ DumpTraits<std::string>::WriteLong(*stream, compressedTexts);
+}
\ No newline at end of file
diff --git a/Diff/Changes/DiffTextGroup.h b/Diff/Changes/DiffTextGroup.h
new file mode 100644
index 0000000..1329423
--- /dev/null
+++ b/Diff/Changes/DiffTextGroup.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "Change.h"
+
+class DiffTextGroup : public Change
+{
+public:
+ const std::string compressedTexts;
+
+ DiffTextGroup(const std::string &compressedTexts);
+
+ static DiffTextGroup Read(std::istream &stream);
+ virtual std::uint32_t NewLength() override;
+ virtual void WriteInternal() override;
+};
\ No newline at end of file
diff --git a/Diff/Changes/NewRevisionChange.cpp b/Diff/Changes/NewRevisionChange.cpp
index ccc9758..8205edc 100644
--- a/Diff/Changes/NewRevisionChange.cpp
+++ b/Diff/Changes/NewRevisionChange.cpp
@@ -1,28 +1,18 @@
#include "NewRevisionChange.h"
#include "../../DumpObjects/DumpRevision.h"
-#include "../../SevenZip.h"
-
-void NewRevisionChange::EnsureCompressed()
-{
- if (compressedTextSet)
- return;
-
- compressedText = SevenZip::Compress(revision.GetText());
- compressedTextSet = true;
-}
NewRevisionChange NewRevisionChange::Read(std::istream &stream, bool withText)
{
std::uint8_t modelFormatId;
auto revision = DumpRevision::ReadCore(stream, modelFormatId, withText);
+ std::uint8_t textId = 0;
if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
{
- std::string compressedText = DumpTraits<string>::ReadLong(stream);
- revision.SetText(SevenZip::Decompress(compressedText));
+ ReadValue(stream, textId);
}
- return NewRevisionChange(revision, modelFormatId, withText);
+ return NewRevisionChange(revision, modelFormatId, withText, textId);
}
void NewRevisionChange::WriteInternal()
@@ -33,15 +23,12 @@
if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
{
- EnsureCompressed();
- DumpTraits<std::string>::WriteLong(*stream, compressedText);
+ WriteValue(textId);
}
}
std::uint32_t NewRevisionChange::NewLength()
{
- EnsureCompressed();
-
std::uint32_t result = 0;
result += ValueSize(ChangeKind::NewRevision);
@@ -49,8 +36,7 @@
if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
{
- EnsureCompressed();
- result += DumpTraits<std::string>::DumpSizeLong(compressedText);
+ result += ValueSize(textId);
}
return result;
diff --git a/Diff/Changes/NewRevisionChange.h b/Diff/Changes/NewRevisionChange.h
index 732d8f8..169e204 100644
--- a/Diff/Changes/NewRevisionChange.h
+++ b/Diff/Changes/NewRevisionChange.h
@@ -7,17 +7,14 @@
{
private:
bool withText;
-
- std::string compressedText;
- bool compressedTextSet;
-
- void EnsureCompressed();
public:
Revision revision;
std::uint8_t modelFormatId;
- NewRevisionChange(const Revision &revision, std::uint8_t modelFormatId, bool withText)
- : revision(revision), modelFormatId(modelFormatId), withText(withText), compressedTextSet(false)
+ std::uint8_t textId;
+
+ NewRevisionChange(const Revision &revision, std::uint8_t modelFormatId, bool withText, std::uint8_t textId)
+ : revision(revision), modelFormatId(modelFormatId), withText(withText), textId(textId)
{}
static NewRevisionChange Read(std::istream &stream, bool withText);
diff --git a/Diff/Changes/RevisionChange.cpp b/Diff/Changes/RevisionChange.cpp
index f72159a..8c4ab73 100644
--- a/Diff/Changes/RevisionChange.cpp
+++ b/Diff/Changes/RevisionChange.cpp
@@ -1,6 +1,5 @@
#include "RevisionChange.h"
#include "../../DumpObjects/DumpUser.h"
-#include "../../SevenZip.h"
RevisionChangeFlags operator &(RevisionChangeFlags first, RevisionChangeFlags second)
{
@@ -30,23 +29,14 @@
}
-void RevisionChange::EnsureCompressed()
-{
- if (compressedTextSet)
- return;
-
- compressedText = SevenZip::Compress(revisionChanges.GetText());
- compressedTextSet = true;
-}
-
RevisionChange::RevisionChange(bool withText)
: withText(withText)
{}
RevisionChange::RevisionChange(
const Revision &oldRevision, Revision &newRevision,
- std::uint8_t newRevisionModelFormatId, bool withText)
- : withText(withText)
+ std::uint8_t newRevisionModelFormatId, bool withText, std::uint8_t textId)
+ : withText(withText), textId(textId)
{
revisionChanges = newRevision;
this->newRevisionModelFormatId = newRevisionModelFormatId;
@@ -116,10 +106,7 @@
ReadValue(stream, result.revisionChanges.Sha1);
if (withText)
- {
- auto compressedText = DumpTraits<std::string>::ReadLong(stream);
- result.revisionChanges.SetText(SevenZip::Decompress(compressedText));
- }
+ ReadValue(stream, result.textId);
else
ReadValue(stream, result.revisionChanges.TextLength);
}
@@ -156,10 +143,7 @@
WriteValue(revisionChanges.Sha1);
if (withText)
- {
- EnsureCompressed();
- DumpTraits<std::string>::WriteLong(*stream, compressedText);
- }
+ WriteValue(textId);
else
WriteValue(revisionChanges.TextLength);
}
@@ -196,10 +180,7 @@
result += ValueSize(revisionChanges.Sha1);
if (withText)
- {
- EnsureCompressed();
- result += DumpTraits<std::string>::DumpSizeLong(compressedText);
- }
+ result += ValueSize(textId);
else
result += ValueSize(revisionChanges.TextLength);
}
diff --git a/Diff/Changes/RevisionChange.h b/Diff/Changes/RevisionChange.h
index b989581..73b2662 100644
--- a/Diff/Changes/RevisionChange.h
+++ b/Diff/Changes/RevisionChange.h
@@ -26,21 +26,18 @@
private:
bool withText;
- std::string compressedText;
- bool compressedTextSet;
-
RevisionChange(bool withText);
-
- void EnsureCompressed();
public:
RevisionChangeFlags flags;
Revision revisionChanges;
std::uint8_t newRevisionModelFormatId;
+ std::uint8_t textId;
+
RevisionChange(
const Revision &oldRevision, Revision &newRevision,
- std::uint8_t newRevisionModelFormatId, bool withText);
+ std::uint8_t newRevisionModelFormatId, bool withText, std::uint8_t textId);
static RevisionChange Read(std::istream &stream, bool withText);
virtual void WriteInternal() override;
diff --git a/Diff/DiffReader.cpp b/Diff/DiffReader.cpp
index abb11c3..3fd291c 100644
--- a/Diff/DiffReader.cpp
+++ b/Diff/DiffReader.cpp
@@ -69,10 +69,13 @@
case ChangeKind::DeletePagePartial:
changeProcessor.Process(PartialDeletePageChange::Read(*stream));
break;
+ case ChangeKind::TextGroup:
+ changeProcessor.Process(DiffTextGroup::Read(*stream));
+ break;
default:
throw DumpException();
}
}
- changeProcessor.End();
+ changeProcessor.Complete();
}
\ No newline at end of file
diff --git a/Diff/DiffWriter.cpp b/Diff/DiffWriter.cpp
index e192d46..38c0f7d 100644
--- a/Diff/DiffWriter.cpp
+++ b/Diff/DiffWriter.cpp
@@ -18,9 +18,27 @@
if (!pageWritten)
{
- unwrittenPage->Write(stream.get());
+ Process(*unwrittenPage);
unwrittenPage = nullptr;
pageWritten = true;
+ }
+}
+
+template <typename TChange>
+void DiffWriter::Process(TChange &change)
+{
+ if (IsPages(dumpKind))
+ changeQueue.push(std::unique_ptr<Change>(new TChange(change)));
+ else
+ change.Write(stream.get());
+}
+
+void DiffWriter::HandleQueue()
+{
+ while (!changeQueue.empty())
+ {
+ changeQueue.front()->Write(stream.get());
+ changeQueue.pop();
}
}
@@ -56,7 +74,7 @@
throw DumpException();
NewPageChange change(page);
- change.Write(stream.get());
+ Process(change);
pageWritten = true;
pageStarted = true;
@@ -73,7 +91,7 @@
if (change.HasChanges())
{
- change.Write(stream.get());
+ Process(change);
pageWritten = true;
}
else
@@ -106,10 +124,10 @@
throw DumpException();
NewModelFormatChange change(id, model, format);
- change.Write(stream.get());
+ Process(change);
}
-void DiffWriter::NewRevision(const Revision &revision, std::uint8_t modelFormatId)
+void DiffWriter::NewRevision(const Revision &revision, std::uint8_t modelFormatId, std::uint8_t textId)
{
if (!dumpStarted)
throw DumpException();
@@ -118,12 +136,12 @@
EnsurePageWritten();
- NewRevisionChange change(revision, modelFormatId, IsPages(dumpKind));
- change.Write(stream.get());
+ NewRevisionChange change(revision, modelFormatId, IsPages(dumpKind), textId);
+ Process(change);
}
void DiffWriter::ChangeRevision(
- const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId)
+ const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId , std::uint8_t textId)
{
if (!dumpStarted)
throw DumpException();
@@ -132,8 +150,8 @@
EnsurePageWritten();
- RevisionChange change(oldRevision, newRevision, newModelFormatId, IsPages(dumpKind));
- change.Write(stream.get());
+ RevisionChange change(oldRevision, newRevision, newModelFormatId, IsPages(dumpKind), textId);
+ Process(change);
}
void DiffWriter::DeleteRevision(std::uint32_t revisionId)
@@ -145,7 +163,7 @@
EnsurePageWritten();
DeleteRevisionChange change(revisionId);
- change.Write(stream.get());
+ Process(change);
}
void DiffWriter::EndPage()
@@ -169,7 +187,7 @@
throw DumpException();
FullDeletePageChange change(pageId);
- change.Write(stream.get());
+ Process(change);
}
void DiffWriter::DeletePagePartial(std::uint32_t pageId)
@@ -180,5 +198,21 @@
throw DumpException();
PartialDeletePageChange change(pageId);
- change.Write(stream.get());
+ Process(change);
+}
+
+void DiffWriter::SetTextGroup(const std::string& compressedTexts)
+{
+ if (!dumpStarted)
+ throw DumpException();
+
+ DiffTextGroup diffTextGroup(compressedTexts);
+ diffTextGroup.Write(stream.get());
+
+ HandleQueue();
+}
+
+void DiffWriter::Complete()
+{
+ HandleQueue();
}
\ No newline at end of file
diff --git a/Diff/DiffWriter.h b/Diff/DiffWriter.h
index 76546ca..d789554 100644
--- a/Diff/DiffWriter.h
+++ b/Diff/DiffWriter.h
@@ -1,6 +1,7 @@
#pragma once
#include <string>
+#include <queue>
#include "Changes/PageChange.h"
#include "../Objects/Page.h"
#include "../Objects/Revision.h"
@@ -22,7 +23,15 @@
const std::string oldTimestamp;
const std::string newTimestamp;
+ std::queue<std::unique_ptr<Change>> changeQueue;
+
void EnsurePageWritten();
+
+ // either saves the change directly, or adds it to queue
+ template <typename TChange>
+ void Process(TChange &change);
+
+ void HandleQueue();
public:
DiffWriter(const std::string &fileName, const std::string &name, const std::string &oldTimestamp, const std::string &newTimestamp);
@@ -35,12 +44,16 @@
void NewModelFormat(std::uint8_t id, const std::string &model, const std::string &format);
- void NewRevision(const Revision &revision, std::uint8_t modelFormatId);
- void ChangeRevision(const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId);
+ void NewRevision(const Revision &revision, std::uint8_t modelFormatId, std::uint8_t textId);
+ void ChangeRevision(const Revision &oldRevision, Revision &newRevision, std::uint8_t newModelFormatId, std::uint8_t textId);
void DeleteRevision(std::uint32_t revisionId);
void EndPage();
void DeletePageFull(std::uint32_t pageId);
void DeletePagePartial(std::uint32_t pageId);
+
+ void SetTextGroup(const std::string& compressedTexts);
+
+ void Complete();
};
\ No newline at end of file
diff --git a/Dump.cpp b/Dump.cpp
index e4cad1c..46d2d31 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -90,9 +90,9 @@
return dump;
}
-void WritableDump::Complete()
+void WritableDump::Complete(DiffWriter* diffWriter)
{
- textGroupsManager->Complete();
+ textGroupsManager->Complete(diffWriter);
spaceIndex->Write();
pageIdIndex->Write();
diff --git a/Dump.h b/Dump.h
index 4d80a7b..e454671 100644
--- a/Dump.h
+++ b/Dump.h
@@ -8,6 +8,7 @@
#include "DumpObjects/FileHeader.h"
#include "DumpObjects/DumpSiteInfo.h"
#include "DumpObjects/TextGroup.h"
+#include "Diff/DiffWriter.h"
using std::int32_t;
using std::int64_t;
@@ -70,7 +71,7 @@
std::unique_ptr<TextGroupsManager> textGroupsManager;
// it's necessary to call this after writing is finished
- void Complete();
+ void Complete(DiffWriter* diffWriter);
void DeletePagePartial(std::uint32_t pageId);
// also deletes revisions of the given page
diff --git a/DumpObjects/DumpRevision.cpp b/DumpObjects/DumpRevision.cpp
index aedc93c..16c010a 100644
--- a/DumpObjects/DumpRevision.cpp
+++ b/DumpObjects/DumpRevision.cpp
@@ -70,7 +70,7 @@
if (wasLoaded && originalRevision == revision)
{
if (forceDiff && diffWriter != nullptr)
- diffWriter->ChangeRevision(originalRevision, revision, modelFormatId);
+ diffWriter->ChangeRevision(originalRevision, revision, modelFormatId, textId);
return;
}
@@ -94,7 +94,7 @@
if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
{
- if (textGroupId == 0 || originalRevision.Sha1 != revision.Sha1)
+ if (textGroupId == 0 || (!textSaved && originalRevision.Sha1 != revision.Sha1))
{
if (textGroupId != 0)
DeleteText();
@@ -111,9 +111,9 @@
if (diffWriter != nullptr)
{
if (wasLoaded)
- diffWriter->ChangeRevision(originalRevision, revision, modelFormatId);
+ diffWriter->ChangeRevision(originalRevision, revision, modelFormatId, textId);
else
- diffWriter->NewRevision(revision, modelFormatId);
+ diffWriter->NewRevision(revision, modelFormatId, textId);
}
}
@@ -126,7 +126,7 @@
else
dumpRef->revisionIdIndex->Add(revision.RevisionId, offset);
- dumpRef->textGroupsManager->WriteTextGroupIfFull();
+ dumpRef->textGroupsManager->WriteTextGroupIfFull(diffWriter);
}
std::uint32_t DumpRevision::NewLength()
@@ -143,7 +143,7 @@
}
DumpRevision::DumpRevision(std::weak_ptr<WritableDump> dump, std::uint32_t revisionId)
- : DumpObject(dump), revision(), modelFormatId(), textGroupId(0), textId(0),
+ : DumpObject(dump), revision(), modelFormatId(), textGroupId(0), textId(0), textSaved(false),
isModelFormatIdNew(false), wasLoaded(true), textUnloaded(false), diffWriter(), forceDiff(false)
{
withText = IsPages(dump.lock()->fileHeader.Kind);
@@ -368,6 +368,13 @@
return result;
}
+void DumpRevision::SetTextGroup(std::uint32_t textGroupId, std::uint8_t textId)
+{
+ this->textGroupId = textGroupId;
+ this->textId = textId;
+ this->textSaved = true;
+}
+
void DumpRevision::DeleteText()
{
if (withText && !HasFlag(revision.Flags, RevisionFlags::TextDeleted))
diff --git a/DumpObjects/DumpRevision.h b/DumpObjects/DumpRevision.h
index 47e3009..833fed1 100644
--- a/DumpObjects/DumpRevision.h
+++ b/DumpObjects/DumpRevision.h
@@ -17,6 +17,8 @@
std::uint32_t textGroupId;
std::uint8_t textId;
+ // is set when text group is set from outside
+ bool textSaved;
std::uint32_t textLength;
std::uint64_t textOffset;
@@ -48,6 +50,8 @@
static void WriteCore(std::ostream &stream, Revision &revision, std::uint8_t modelFormatId, bool withText);
static std::uint32_t LengthCore(const Revision &revision, std::uint8_t modelFormatId, bool withText);
+ void SetTextGroup(std::uint32_t textGroupId, std::uint8_t textId);
+
// deletes text of the current revision from the dump, if there is any
void DeleteText();
};
\ No newline at end of file
diff --git a/DumpObjects/TextGroup.cpp b/DumpObjects/TextGroup.cpp
index 7314741..120546c 100644
--- a/DumpObjects/TextGroup.cpp
+++ b/DumpObjects/TextGroup.cpp
@@ -8,9 +8,6 @@
void TextGroup::EnsureCompressed()
{
- if (texts.empty())
- throw DumpException();
-
if (!compressedTexts.empty())
return;
@@ -151,6 +148,14 @@
return result;
}
+void TextGroup::SetCompressedTexts(const std::string& newCompressedTexts)
+{
+ if (!compressedTexts.empty() || !texts.empty())
+ throw DumpException();
+
+ compressedTexts = newCompressedTexts;
+}
+
bool TextGroup::IsEmpty() const
{
return std::all_of(texts.begin(), texts.end(), [](const std::string &s) { return s == deletedTextMark; });
diff --git a/DumpObjects/TextGroup.h b/DumpObjects/TextGroup.h
index 7184db8..0ea50e3 100644
--- a/DumpObjects/TextGroup.h
+++ b/DumpObjects/TextGroup.h
@@ -7,7 +7,6 @@
private:
std::uint32_t textGroupId;
std::vector<std::string> texts;
- std::string compressedTexts;
// if a group is not editable, new texts cannot be added, but old texts can still be deleted
bool isEditable;
@@ -20,6 +19,8 @@
virtual void WriteInternal() override;
virtual void UpdateIndex(Offset offset, bool overwrite) override;
public:
+ std::string compressedTexts;
+
TextGroup(std::weak_ptr<WritableDump> dump, std::uint32_t textGroupId);
virtual uint32_t NewLength() override;
@@ -31,5 +32,7 @@
bool DeleteText(std::uint8_t textId);
std::string GetText(std::uint8_t textId) const;
+ void SetCompressedTexts(const std::string& compressedTexts);
+
bool IsFull() const;
};
\ No newline at end of file
diff --git a/DumpWriters/CompositeWriter.cpp b/DumpWriters/CompositeWriter.cpp
index ad75049..4e53dd0 100644
--- a/DumpWriters/CompositeWriter.cpp
+++ b/DumpWriters/CompositeWriter.cpp
@@ -34,8 +34,8 @@
writer->SetDumpKind(dumpKind);
}
-void CompositeWriter::EndDump()
+void CompositeWriter::Complete()
{
for (auto &writer : writers)
- writer->EndDump();
+ writer->Complete();
}
\ No newline at end of file
diff --git a/DumpWriters/CompositeWriter.h b/DumpWriters/CompositeWriter.h
index 8966f52..2967d3f 100644
--- a/DumpWriters/CompositeWriter.h
+++ b/DumpWriters/CompositeWriter.h
@@ -20,5 +20,5 @@
virtual void EndPage() override;
virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
virtual void SetDumpKind(DumpKind dumpKind) override;
- virtual void EndDump() override;
+ virtual void Complete() override;
};
\ No newline at end of file
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 55af7df..1154b20 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -137,7 +137,7 @@
}
}
-void DumpWriter::EndDump()
+void DumpWriter::Complete()
{
for (std::uint32_t i = 0; i < unvisitedPageIds.size(); i++)
{
@@ -162,5 +162,8 @@
}
}
- dump->Complete();
+ dump->Complete(diffWriter.get());
+
+ if (diffWriter != nullptr)
+ diffWriter->Complete();
}
\ No newline at end of file
diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h
index 2d99090..c70cbc9 100644
--- a/DumpWriters/DumpWriter.h
+++ b/DumpWriters/DumpWriter.h
@@ -29,5 +29,5 @@
virtual void EndPage() override;
virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
virtual void SetDumpKind(DumpKind dumpKind) override;
- virtual void EndDump() override;
+ virtual void Complete() override;
};
\ No newline at end of file
diff --git a/DumpWriters/IDumpWriter.h b/DumpWriters/IDumpWriter.h
index 42cde90..cc50438 100644
--- a/DumpWriters/IDumpWriter.h
+++ b/DumpWriters/IDumpWriter.h
@@ -14,7 +14,7 @@
virtual void EndPage() = 0;
virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) = 0;
virtual void SetDumpKind(DumpKind dumpKind) = 0;
- virtual void EndDump() = 0;
+ virtual void Complete() = 0;
virtual ~IDumpWriter() {}
};
diff --git a/DumpWriters/WriterWrapper.cpp b/DumpWriters/WriterWrapper.cpp
index 697f89f..886b255 100644
--- a/DumpWriters/WriterWrapper.cpp
+++ b/DumpWriters/WriterWrapper.cpp
@@ -20,7 +20,7 @@
wrapped->SetSiteInfo(siteInfo);
}
-void WriterWrapper::EndDump()
+void WriterWrapper::Complete()
{
- wrapped->EndDump();
+ wrapped->Complete();
}
\ No newline at end of file
diff --git a/DumpWriters/WriterWrapper.h b/DumpWriters/WriterWrapper.h
index 7718133..d520177 100644
--- a/DumpWriters/WriterWrapper.h
+++ b/DumpWriters/WriterWrapper.h
@@ -16,5 +16,5 @@
virtual void EndPage() override;
virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
virtual void SetDumpKind(DumpKind dumpKind) override = 0;
- virtual void EndDump() override;
+ virtual void Complete() override;
};
\ No newline at end of file
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index cce07ea..81882bb 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -84,6 +84,7 @@
<ClCompile Include="Diff\ChangeProcessor.cpp" />
<ClCompile Include="Diff\Changes\Change.cpp" />
<ClCompile Include="Diff\Changes\DeleteRevisionChange.cpp" />
+ <ClCompile Include="Diff\Changes\DiffTextGroup.cpp" />
<ClCompile Include="Diff\Changes\FullDeletePageChange.cpp" />
<ClCompile Include="Diff\Changes\PartialDeletePageChange.cpp" />
<ClCompile Include="Diff\DiffReader.cpp" />
@@ -107,6 +108,7 @@
<ClInclude Include="Diff\ChangeProcessor.h" />
<ClInclude Include="Diff\Changes\Change.h" />
<ClInclude Include="Diff\Changes\DeleteRevisionChange.h" />
+ <ClInclude Include="Diff\Changes\DiffTextGroup.h" />
<ClInclude Include="Diff\Changes\FullDeletePageChange.h" />
<ClInclude Include="Diff\Changes\PartialDeletePageChange.h" />
<ClInclude Include="Diff\DiffReader.h" />
diff --git a/TODO.txt b/TODO.txt
index 81942cd..151e49f 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,13 +1,13 @@
short term:
- don't write empty unsaved indexes
- compression of metadata and indexes
-- better (group) compression of diff dumps
- - one group for id and dd (no compressing twice)
+- better error messages (including bad filename)
- idumps r to stdout
- std::
- encapsulate public fields? (e.g. in WritableDump)
- deal with Readable/WritableDump
- extract base36 code from DumpRevision
+- save dumpkind
- do something with build process
long term:
diff --git a/TextGroupsManager.cpp b/TextGroupsManager.cpp
index dba23ca..900159f 100644
--- a/TextGroupsManager.cpp
+++ b/TextGroupsManager.cpp
@@ -6,6 +6,12 @@
: dump(dump), textGroupModified(false)
{ }
+std::unique_ptr<TextGroup> TextGroupsManager::CreateNewGroup()
+{
+ std::uint32_t newId = getNewId(*dump.lock()->textGroupIdIndex);
+ return std::unique_ptr<TextGroup>(new TextGroup(dump, newId));
+}
+
std::pair<std::uint32_t, std::uint8_t> TextGroupsManager::AddTextToGroup(const std::string& text)
{
auto dumpRef = dump.lock();
@@ -14,8 +20,7 @@
{
auto streamPos = dumpRef->stream->tellg();
- std::uint32_t newId = getNewId(*dumpRef->textGroupIdIndex);
- currentGroup = std::unique_ptr<TextGroup>(new TextGroup(dump, newId));
+ currentGroup = CreateNewGroup();
textGroupModified = false;
// this method can be written in the middle of writing an object,
@@ -30,19 +35,22 @@
return std::make_pair(textGroupId, textId);
}
-void TextGroupsManager::EndGroup()
+void TextGroupsManager::EndGroup(DiffWriter* diffWriter)
{
if (textGroupModified)
currentGroup->Write();
+
+ if (diffWriter != nullptr)
+ diffWriter->SetTextGroup(currentGroup->compressedTexts);
currentGroup = nullptr;
textGroupModified = false;
}
-void TextGroupsManager::WriteTextGroupIfFull()
+void TextGroupsManager::WriteTextGroupIfFull(DiffWriter* diffWriter)
{
if (currentGroup != nullptr && currentGroup->IsFull())
- EndGroup();
+ EndGroup(diffWriter);
}
void TextGroupsManager::DeleteTextFromGroup(std::uint32_t textGroupId, std::uint8_t textId)
@@ -73,7 +81,7 @@
{
if (currentGroup == nullptr || currentGroup->GetTextGroupId() != textGroupId)
{
- EndGroup();
+ EndGroup(nullptr);
currentGroup = std::unique_ptr<TextGroup>(new TextGroup(dump, textGroupId));
textGroupModified = false;
@@ -82,8 +90,17 @@
return currentGroup->GetText(textId);
}
-void TextGroupsManager::Complete()
+std::uint32_t TextGroupsManager::ImportTextGroup(const std::string& compressedTexts)
{
- EndGroup();
+ auto group = CreateNewGroup();
+ group->SetCompressedTexts(compressedTexts);
+ group->Write();
+
+ return group->GetTextGroupId();
+}
+
+void TextGroupsManager::Complete(DiffWriter* diffWriter)
+{
+ EndGroup(diffWriter);
EndDeletedGroup();
}
\ No newline at end of file
diff --git a/TextGroupsManager.h b/TextGroupsManager.h
index e854482..337f62f 100644
--- a/TextGroupsManager.h
+++ b/TextGroupsManager.h
@@ -5,6 +5,7 @@
#include <utility>
#include <memory>
#include "DumpObjects/TextGroup.h"
+#include "Diff/DiffWriter.h"
class TextGroupsManager
{
@@ -15,17 +16,21 @@
std::unique_ptr<TextGroup> currentDeletedGroup;
bool textGroupModified;
+ std::unique_ptr<TextGroup> CreateNewGroup();
+
+ void EndGroup(DiffWriter* diffWriter);
void EndDeletedGroup();
public:
TextGroupsManager(std::weak_ptr<WritableDump> dump);
std::pair<std::uint32_t, std::uint8_t> AddTextToGroup(const std::string& text);
- void EndGroup();
- void WriteTextGroupIfFull();
+ void WriteTextGroupIfFull(DiffWriter* diffWriter);
void DeleteTextFromGroup(std::uint32_t textGroupId, std::uint8_t textId);
std::string GetTextFromGroup(std::uint32_t textGroupId, std::uint8_t textId);
- void Complete();
+ std::uint32_t ImportTextGroup(const std::string& compressedTexts);
+
+ void Complete(DiffWriter* diffWriter);
};
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 5a20936..28f3080 100644
--- a/main.cpp
+++ b/main.cpp
@@ -163,7 +163,7 @@
XmlMediawikiProcessor::Process(&writer, inputFileName);
- writer.EndDump();
+ writer.Complete();
}
void updateDump(std::queue<std::string> &parameters)
@@ -210,7 +210,7 @@
XmlMediawikiProcessor::Process(&writer, dumpBackupStream);
- writer.EndDump();
+ writer.Complete();
}
void readDump(std::string dumpFileName, std::string outputFileName)
--
To view, visit https://gerrit.wikimedia.org/r/83994
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I2d357218056f9324afa21d586c4b40d935b84830
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
Gerrit-Reviewer: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits