Petr Onderka has submitted this change and it was merged.
Change subject: Changed handling of undeleted pages in diff dumps
......................................................................
Changed handling of undeleted pages in diff dumps
Change-Id: I63ce069375cdfa71a19021ed06bdd846683b2fdb
---
M CMakeLists.txt
M Diff/ChangeProcessor.cpp
M Diff/ChangeProcessor.h
M Diff/ChangeVisitor.h
M Diff/Changes/Change.h
D Diff/Changes/DeletePageChange.cpp
A Diff/Changes/FullDeletePageChange.cpp
R Diff/Changes/FullDeletePageChange.h
A Diff/Changes/PartialDeletePageChange.cpp
C Diff/Changes/PartialDeletePageChange.h
M Diff/DiffReader.cpp
M Diff/DiffWriter.cpp
M Diff/DiffWriter.h
M Dump.cpp
M Dump.h
M DumpWriters/DumpWriter.cpp
M Incremental dumps.vcxproj
17 files changed, 176 insertions(+), 78 deletions(-)
Approvals:
Petr Onderka: Verified; Looks good to me, approved
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 95e3d45..486fbf9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,8 @@
set(SOURCES_files_ClCompile
Diff/ChangeProcessor.cpp
Diff/Changes/Change.cpp
- Diff/Changes/DeletePageChange.cpp
+ Diff/Changes/FullDeletePageChange.cpp
+ Diff/Changes/PartialDeletePageChange.cpp
Diff/Changes/DeleteRevisionChange.cpp
Diff/Changes/NewModelFormatChange.cpp
Diff/Changes/NewPageChange.cpp
@@ -69,7 +70,8 @@
set(SOURCES_files_ClInclude
Diff/ChangeProcessor.h
Diff/Changes/Change.h
- Diff/Changes/DeletePageChange.h
+ Diff/Changes/FullDeletePageChange.h
+ Diff/Changes/PartialDeletePageChange.h
Diff/Changes/DeleteRevisionChange.h
Diff/Changes/NewModelFormatChange.h
Diff/Changes/NewPageChange.h
diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp
index bbdbc5b..5829e1c 100644
--- a/Diff/ChangeProcessor.cpp
+++ b/Diff/ChangeProcessor.cpp
@@ -70,8 +70,6 @@
dumpRevision.Write();
currentPage->page.RevisionIds.insert(change.revision.RevisionId);
-
- visitedRevisionIds.insert(change.revision.RevisionId);
}
void ChangeProcessor::Process(RevisionChange change)
@@ -112,22 +110,26 @@
dumpRevision.Write();
currentPage->page.RevisionIds.insert(revision.RevisionId);
-
- visitedRevisionIds.insert(change.revisionChanges.RevisionId);
}
void ChangeProcessor::Process(DeleteRevisionChange change)
{
- dump->DeleteRevision(change.revisionId, visitedRevisionIds);
+ dump->DeleteRevision(change.revisionId);
auto &revisionIds = currentPage->page.RevisionIds;
auto deletedPos = std::find(revisionIds.begin(), revisionIds.end(),
change.revisionId);
- revisionIds.erase(deletedPos);
+ if (deletedPos != revisionIds.end())
+ revisionIds.erase(deletedPos);
}
-void ChangeProcessor::Process(DeletePageChange change)
+void ChangeProcessor::Process(FullDeletePageChange change)
{
- dump->DeletePage(change.pageId, visitedRevisionIds);
+ dump->DeletePageFull(change.pageId);
+}
+
+void ChangeProcessor::Process(PartialDeletePageChange change)
+{
+ dump->DeletePagePartial(change.pageId);
}
void ChangeProcessor::End()
@@ -135,4 +137,4 @@
WritePage();
dump->WriteIndexes();
-}
+}
\ No newline at end of file
diff --git a/Diff/ChangeProcessor.h b/Diff/ChangeProcessor.h
index 6253867..8628efd 100644
--- a/Diff/ChangeProcessor.h
+++ b/Diff/ChangeProcessor.h
@@ -11,8 +11,6 @@
std::unique_ptr<DumpPage> currentPage;
- std::unordered_set<std::uint32_t> visitedRevisionIds;
-
void WritePage();
public:
ChangeProcessor(std::shared_ptr<WritableDump> dump);
@@ -24,7 +22,8 @@
void Process(NewRevisionChange change);
void Process(RevisionChange change);
void Process(DeleteRevisionChange change);
- void Process(DeletePageChange change);
+ void Process(FullDeletePageChange change);
+ void Process(PartialDeletePageChange change);
// has to be called after processing is complete
void End();
diff --git a/Diff/ChangeVisitor.h b/Diff/ChangeVisitor.h
index d2f59a6..26ea54a 100644
--- a/Diff/ChangeVisitor.h
+++ b/Diff/ChangeVisitor.h
@@ -7,7 +7,8 @@
#include "Changes/NewRevisionChange.h"
#include "Changes/RevisionChange.h"
#include "Changes/DeleteRevisionChange.h"
-#include "Changes/DeletePageChange.h"
+#include "Changes/FullDeletePageChange.h"
+#include "Changes/PartialDeletePageChange.h"
class ChangeVisitor
{
@@ -19,5 +20,6 @@
virtual void Visit(NewRevisionChange &change) = 0;
virtual void Visit(RevisionChange &change) = 0;
virtual void Visit(DeleteRevisionChange &change) = 0;
- virtual void Visit(DeletePageChange &change) = 0;
+ virtual void Visit(FullDeletePageChange &change) = 0;
+ virtual void Visit(PartialDeletePageChange &change) = 0;
};
\ No newline at end of file
diff --git a/Diff/Changes/Change.h b/Diff/Changes/Change.h
index aab409a..ef56629 100644
--- a/Diff/Changes/Change.h
+++ b/Diff/Changes/Change.h
@@ -6,17 +6,18 @@
enum class ChangeKind : std::uint8_t
{
- SiteInfo = 0x01,
+ SiteInfo = 0x01,
- NewPage = 0x10,
- ChangePage = 0x11,
- DeletePage = 0x12,
+ NewPage = 0x10,
+ ChangePage = 0x11,
+ DeletePageFull = 0x12,
+ DeletePagePartial = 0x13,
- NewRevision = 0x20,
- ChangeRevision = 0x21,
- DeleteRevision = 0x22,
+ NewRevision = 0x20,
+ ChangeRevision = 0x21,
+ DeleteRevision = 0x22,
- NewModelFormat = 0x30
+ NewModelFormat = 0x30
};
class Change : public DumpObjectBase
diff --git a/Diff/Changes/DeletePageChange.cpp
b/Diff/Changes/DeletePageChange.cpp
deleted file mode 100644
index a4833ce..0000000
--- a/Diff/Changes/DeletePageChange.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "DeletePageChange.h"
-#include "../ChangeVisitor.h"
-
-DeletePageChange DeletePageChange::Read(std::istream &stream)
-{
- std::uint32_t pageId;
- ReadValue(stream, pageId);
-
- return DeletePageChange(pageId);
-}
-
-void DeletePageChange::WriteInternal()
-{
- WriteValue(ChangeKind::DeletePage);
- WriteValue(pageId);
-}
-
-std::uint32_t DeletePageChange::NewLength()
-{
- return ValueSize(ChangeKind::DeletePage) + ValueSize(pageId);
-}
-
-void DeletePageChange::Accept(ChangeVisitor &visitor)
-{
- visitor.Visit(*this);
-}
\ No newline at end of file
diff --git a/Diff/Changes/FullDeletePageChange.cpp
b/Diff/Changes/FullDeletePageChange.cpp
new file mode 100644
index 0000000..a23d918
--- /dev/null
+++ b/Diff/Changes/FullDeletePageChange.cpp
@@ -0,0 +1,26 @@
+#include "FullDeletePageChange.h"
+#include "../ChangeVisitor.h"
+
+FullDeletePageChange FullDeletePageChange::Read(std::istream &stream)
+{
+ std::uint32_t pageId;
+ ReadValue(stream, pageId);
+
+ return FullDeletePageChange(pageId);
+}
+
+void FullDeletePageChange::WriteInternal()
+{
+ WriteValue(ChangeKind::DeletePageFull);
+ WriteValue(pageId);
+}
+
+std::uint32_t FullDeletePageChange::NewLength()
+{
+ return ValueSize(ChangeKind::DeletePageFull) + ValueSize(pageId);
+}
+
+void FullDeletePageChange::Accept(ChangeVisitor &visitor)
+{
+ visitor.Visit(*this);
+}
\ No newline at end of file
diff --git a/Diff/Changes/DeletePageChange.h
b/Diff/Changes/FullDeletePageChange.h
similarity index 63%
rename from Diff/Changes/DeletePageChange.h
rename to Diff/Changes/FullDeletePageChange.h
index 98c6f4b..79761fc 100644
--- a/Diff/Changes/DeletePageChange.h
+++ b/Diff/Changes/FullDeletePageChange.h
@@ -2,16 +2,16 @@
#include "Change.h"
-class DeletePageChange : public Change
+class FullDeletePageChange : public Change
{
public:
std::uint32_t pageId;
- DeletePageChange(std::uint32_t pageId)
+ FullDeletePageChange(std::uint32_t pageId)
: pageId(pageId)
{}
- static DeletePageChange Read(std::istream &stream);
+ static FullDeletePageChange Read(std::istream &stream);
virtual void WriteInternal() override;
virtual std::uint32_t NewLength() override;
diff --git a/Diff/Changes/PartialDeletePageChange.cpp
b/Diff/Changes/PartialDeletePageChange.cpp
new file mode 100644
index 0000000..d3e8247
--- /dev/null
+++ b/Diff/Changes/PartialDeletePageChange.cpp
@@ -0,0 +1,26 @@
+#include "PartialDeletePageChange.h"
+#include "../ChangeVisitor.h"
+
+PartialDeletePageChange PartialDeletePageChange::Read(std::istream &stream)
+{
+ std::uint32_t pageId;
+ ReadValue(stream, pageId);
+
+ return PartialDeletePageChange(pageId);
+}
+
+void PartialDeletePageChange::WriteInternal()
+{
+ WriteValue(ChangeKind::DeletePagePartial);
+ WriteValue(pageId);
+}
+
+std::uint32_t PartialDeletePageChange::NewLength()
+{
+ return ValueSize(ChangeKind::DeletePagePartial) + ValueSize(pageId);
+}
+
+void PartialDeletePageChange::Accept(ChangeVisitor &visitor)
+{
+ visitor.Visit(*this);
+}
\ No newline at end of file
diff --git a/Diff/Changes/DeletePageChange.h
b/Diff/Changes/PartialDeletePageChange.h
similarity index 62%
copy from Diff/Changes/DeletePageChange.h
copy to Diff/Changes/PartialDeletePageChange.h
index 98c6f4b..65db24e 100644
--- a/Diff/Changes/DeletePageChange.h
+++ b/Diff/Changes/PartialDeletePageChange.h
@@ -2,16 +2,16 @@
#include "Change.h"
-class DeletePageChange : public Change
+class PartialDeletePageChange : public Change
{
public:
std::uint32_t pageId;
- DeletePageChange(std::uint32_t pageId)
+ PartialDeletePageChange(std::uint32_t pageId)
: pageId(pageId)
{}
- static DeletePageChange Read(std::istream &stream);
+ static PartialDeletePageChange Read(std::istream &stream);
virtual void WriteInternal() override;
virtual std::uint32_t NewLength() override;
diff --git a/Diff/DiffReader.cpp b/Diff/DiffReader.cpp
index 614ad3c..abb11c3 100644
--- a/Diff/DiffReader.cpp
+++ b/Diff/DiffReader.cpp
@@ -63,8 +63,11 @@
case ChangeKind::DeleteRevision:
changeProcessor.Process(DeleteRevisionChange::Read(*stream));
break;
- case ChangeKind::DeletePage:
- changeProcessor.Process(DeletePageChange::Read(*stream));
+ case ChangeKind::DeletePageFull:
+ changeProcessor.Process(FullDeletePageChange::Read(*stream));
+ break;
+ case ChangeKind::DeletePagePartial:
+ changeProcessor.Process(PartialDeletePageChange::Read(*stream));
break;
default:
throw DumpException();
diff --git a/Diff/DiffWriter.cpp b/Diff/DiffWriter.cpp
index b4d6bf7..e192d46 100644
--- a/Diff/DiffWriter.cpp
+++ b/Diff/DiffWriter.cpp
@@ -7,7 +7,8 @@
#include "Changes/NewRevisionChange.h"
#include "Changes/RevisionChange.h"
#include "Changes/DeleteRevisionChange.h"
-#include "Changes/DeletePageChange.h"
+#include "Changes/FullDeletePageChange.h"
+#include "Changes/PartialDeletePageChange.h"
#include "../DumpObjects/FileHeader.h"
void DiffWriter::EnsurePageWritten()
@@ -139,10 +140,9 @@
{
if (!dumpStarted)
throw DumpException();
- if (!pageStarted)
- throw DumpException();
- EnsurePageWritten();
+ if (pageStarted)
+ EnsurePageWritten();
DeleteRevisionChange change(revisionId);
change.Write(stream.get());
@@ -161,13 +161,24 @@
pageStarted = false;
}
-void DiffWriter::DeletePage(const std::uint32_t pageId)
+void DiffWriter::DeletePageFull(std::uint32_t pageId)
{
if (!dumpStarted)
throw DumpException();
if (pageStarted)
throw DumpException();
- DeletePageChange change(pageId);
+ FullDeletePageChange change(pageId);
+ change.Write(stream.get());
+}
+
+void DiffWriter::DeletePagePartial(std::uint32_t pageId)
+{
+ if (!dumpStarted)
+ throw DumpException();
+ if (pageStarted)
+ throw DumpException();
+
+ PartialDeletePageChange change(pageId);
change.Write(stream.get());
}
\ No newline at end of file
diff --git a/Diff/DiffWriter.h b/Diff/DiffWriter.h
index 4f66281..76546ca 100644
--- a/Diff/DiffWriter.h
+++ b/Diff/DiffWriter.h
@@ -41,5 +41,6 @@
void EndPage();
- void DeletePage(const std::uint32_t pageId);
+ void DeletePageFull(std::uint32_t pageId);
+ void DeletePagePartial(std::uint32_t pageId);
};
\ No newline at end of file
diff --git a/Dump.cpp b/Dump.cpp
index a52be7a..cfc1091 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -91,25 +91,51 @@
modelFormatIndex->Write();
}
-void WritableDump::DeletePage(std::uint32_t pageId, const
std::unordered_set<std::uint32_t> &doNotDeleteRevisions)
+std::pair<bool, std::vector<std::uint32_t>> WritableDump::DeletePage(
+ std::uint32_t pageId, bool full, const std::unordered_set<std::uint32_t>
&doNotDeleteRevisions)
{
Offset offset = pageIdIndex->Get(pageId);
DumpPage page(self, pageId);
std::uint32_t length = page.NewLength();
- for (auto revisionId : page.page.RevisionIds)
+ bool allRevisionsDeleted = true;
+ std::vector<std::uint32_t> deletedRevisions;
+
+ if (full)
{
- DeleteRevision(revisionId, doNotDeleteRevisions);
+ for (auto revisionId : page.page.RevisionIds)
+ {
+ if (DeleteRevision(revisionId, doNotDeleteRevisions))
+ deletedRevisions.push_back(revisionId);
+ else
+ allRevisionsDeleted = false;
+ }
}
pageIdIndex->Remove(pageId);
spaceManager->Delete(offset.value, length);
+
+ if (allRevisionsDeleted)
+ return std::make_pair(true, std::vector<std::uint32_t>());
+
+ return std::make_pair(false, deletedRevisions);
}
-void WritableDump::DeleteRevision(std::uint32_t revisionId, const
std::unordered_set<std::uint32_t> &doNotDeleteRevisions)
+void WritableDump::DeletePagePartial(std::uint32_t pageId)
+{
+ DeletePage(pageId, false, std::unordered_set<std::uint32_t>());
+}
+
+std::pair<bool, std::vector<std::uint32_t>> WritableDump::DeletePageFull(
+ std::uint32_t pageId, const std::unordered_set<std::uint32_t>
&doNotDeleteRevisions)
+{
+ return DeletePage(pageId, true, doNotDeleteRevisions);
+}
+
+bool WritableDump::DeleteRevision(std::uint32_t revisionId, const
std::unordered_set<std::uint32_t> &doNotDeleteRevisions)
{
if (doNotDeleteRevisions.count(revisionId) != 0)
- return;
+ return false;
Offset offset = revisionIdIndex->Get(revisionId);
DumpRevision revision(self, revisionId, false);
@@ -117,6 +143,8 @@
revisionIdIndex->Remove(revisionId);
spaceManager->Delete(offset.value, length);
+
+ return true;
}
const std::string WritableDump::WikitextModel = "wikitext";
diff --git a/Dump.h b/Dump.h
index c8aef2d..e797b79 100644
--- a/Dump.h
+++ b/Dump.h
@@ -22,6 +22,7 @@
class SpaceManager;
+// TODO: ReadableDump is unusable; either merge it with WritableDump, or make
it usable
class ReadableDump
{
protected:
@@ -29,7 +30,7 @@
ReadableDump(unique_ptr<iostream> stream);
public:
- // TODO: others should not be able to steal this stream
+ // TODO: others should not be able to steal these
unique_ptr<iostream> stream;
unique_ptr<Index<uint32_t, Offset>> pageIdIndex;
unique_ptr<Index<uint32_t, Offset>> revisionIdIndex;
@@ -52,6 +53,8 @@
WritableDump(string fileName);
void init(std::weak_ptr<WritableDump> self);
+ std::pair<bool, std::vector<std::uint32_t>> DeletePage(
+ std::uint32_t pageId, bool fullDelete, const
std::unordered_set<std::uint32_t> &doNotDeleteRevisions);
public:
static const std::string WikitextModel;
static const std::string WikitextFormat;
@@ -64,9 +67,15 @@
// it's necessary to call this after writing is finished
void WriteIndexes();
- // also recursively deletes revisions of the given page
- void DeletePage(std::uint32_t pageId, const
std::unordered_set<std::uint32_t> &doNotDeleteRevisions =
std::unordered_set<std::uint32_t>());
- void DeleteRevision(std::uint32_t revisionId, const
std::unordered_set<std::uint32_t> &doNotDeleteRevisions =
std::unordered_set<std::uint32_t>());
+ void DeletePagePartial(std::uint32_t pageId);
+ // also deletes revisions of the given page
+ // first member of returned pair indicates whether all revisions of the
page were deleted
+ // if it's false, second member contains ids of revisions that were
actually deleted
+ std::pair<bool, std::vector<std::uint32_t>> DeletePageFull(
+ std::uint32_t pageId, const std::unordered_set<std::uint32_t>
&doNotDeleteRevisions = std::unordered_set<std::uint32_t>());
+ // returns whether the revision was actually deleted
+ bool DeleteRevision(
+ std::uint32_t revisionId, const std::unordered_set<std::uint32_t>
&doNotDeleteRevisions = std::unordered_set<std::uint32_t>());
std::uint8_t GetIdForModelFormat(std::string model, std::string format,
bool &isNew);
std::pair<std::string, std::string> GetModelFormat(std::uint8_t id);
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index c6a4aa5..96cfa67 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -105,10 +105,22 @@
{
if (unvisitedPageIds.at(i))
{
- dump->DeletePage(i, newRevisionIds);
+ auto deletedRevisions = dump->DeletePageFull(i, newRevisionIds);
if (diffWriter != nullptr)
- diffWriter->DeletePage(i);
+ {
+ if (deletedRevisions.first)
+ {
+ diffWriter->DeletePageFull(i);
+ }
+ else
+ {
+ diffWriter->DeletePagePartial(i);
+
+ for (auto revisionId : deletedRevisions.second)
+ diffWriter->DeleteRevision(revisionId);
+ }
+ }
}
}
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index 82f65ed..dfb84b4 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -83,8 +83,9 @@
<ClCompile Include="CollectionHelpers.cpp" />
<ClCompile Include="Diff\ChangeProcessor.cpp" />
<ClCompile Include="Diff\Changes\Change.cpp" />
- <ClCompile Include="Diff\Changes\DeletePageChange.cpp" />
<ClCompile Include="Diff\Changes\DeleteRevisionChange.cpp" />
+ <ClCompile Include="Diff\Changes\FullDeletePageChange.cpp" />
+ <ClCompile Include="Diff\Changes\PartialDeletePageChange.cpp" />
<ClCompile Include="Diff\DiffReader.cpp" />
<ClCompile Include="Diff\DiffWriter.cpp" />
<ClCompile Include="Diff\Changes\NewModelFormatChange.cpp" />
@@ -105,8 +106,9 @@
<ClInclude Include="CollectionHelpers.h" />
<ClInclude Include="Diff\ChangeProcessor.h" />
<ClInclude Include="Diff\Changes\Change.h" />
- <ClInclude Include="Diff\Changes\DeletePageChange.h" />
<ClInclude Include="Diff\Changes\DeleteRevisionChange.h" />
+ <ClInclude Include="Diff\Changes\FullDeletePageChange.h" />
+ <ClInclude Include="Diff\Changes\PartialDeletePageChange.h" />
<ClInclude Include="Diff\ChangeVisitor.h" />
<ClInclude Include="Diff\DiffReader.h" />
<ClInclude Include="Diff\DiffWriter.h" />
--
To view, visit https://gerrit.wikimedia.org/r/81699
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I63ce069375cdfa71a19021ed06bdd846683b2fdb
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
Gerrit-Reviewer: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits