Petr Onderka has submitted this change and it was merged.

Change subject: Changed handling of undeleted pages in diff dumps
......................................................................


Changed handling of undeleted pages in diff dumps

Change-Id: I63ce069375cdfa71a19021ed06bdd846683b2fdb
---
M CMakeLists.txt
M Diff/ChangeProcessor.cpp
M Diff/ChangeProcessor.h
M Diff/ChangeVisitor.h
M Diff/Changes/Change.h
D Diff/Changes/DeletePageChange.cpp
A Diff/Changes/FullDeletePageChange.cpp
R Diff/Changes/FullDeletePageChange.h
A Diff/Changes/PartialDeletePageChange.cpp
C Diff/Changes/PartialDeletePageChange.h
M Diff/DiffReader.cpp
M Diff/DiffWriter.cpp
M Diff/DiffWriter.h
M Dump.cpp
M Dump.h
M DumpWriters/DumpWriter.cpp
M Incremental dumps.vcxproj
17 files changed, 176 insertions(+), 78 deletions(-)

Approvals:
  Petr Onderka: Verified; Looks good to me, approved



diff --git a/CMakeLists.txt b/CMakeLists.txt
index 95e3d45..486fbf9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,8 @@
 set(SOURCES_files_ClCompile
   Diff/ChangeProcessor.cpp
   Diff/Changes/Change.cpp
-  Diff/Changes/DeletePageChange.cpp
+  Diff/Changes/FullDeletePageChange.cpp
+  Diff/Changes/PartialDeletePageChange.cpp
   Diff/Changes/DeleteRevisionChange.cpp
   Diff/Changes/NewModelFormatChange.cpp
   Diff/Changes/NewPageChange.cpp
@@ -69,7 +70,8 @@
 set(SOURCES_files_ClInclude
   Diff/ChangeProcessor.h
   Diff/Changes/Change.h
-  Diff/Changes/DeletePageChange.h
+  Diff/Changes/FullDeletePageChange.h
+  Diff/Changes/PartialDeletePageChange.h
   Diff/Changes/DeleteRevisionChange.h
   Diff/Changes/NewModelFormatChange.h
   Diff/Changes/NewPageChange.h
diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp
index bbdbc5b..5829e1c 100644
--- a/Diff/ChangeProcessor.cpp
+++ b/Diff/ChangeProcessor.cpp
@@ -70,8 +70,6 @@
     dumpRevision.Write();
 
     currentPage->page.RevisionIds.insert(change.revision.RevisionId);
-
-    visitedRevisionIds.insert(change.revision.RevisionId);
 }
 
 void ChangeProcessor::Process(RevisionChange change)
@@ -112,22 +110,26 @@
     dumpRevision.Write();
 
     currentPage->page.RevisionIds.insert(revision.RevisionId);
-
-    visitedRevisionIds.insert(change.revisionChanges.RevisionId);
 }
 
 void ChangeProcessor::Process(DeleteRevisionChange change)
 {
-    dump->DeleteRevision(change.revisionId, visitedRevisionIds);
+    dump->DeleteRevision(change.revisionId);
 
     auto &revisionIds = currentPage->page.RevisionIds;
     auto deletedPos = std::find(revisionIds.begin(), revisionIds.end(), 
change.revisionId);
-    revisionIds.erase(deletedPos);
+    if (deletedPos != revisionIds.end())
+        revisionIds.erase(deletedPos);
 }
 
-void ChangeProcessor::Process(DeletePageChange change)
+void ChangeProcessor::Process(FullDeletePageChange change)
 {
-    dump->DeletePage(change.pageId, visitedRevisionIds);
+    dump->DeletePageFull(change.pageId);
+}
+
+void ChangeProcessor::Process(PartialDeletePageChange change)
+{
+    dump->DeletePagePartial(change.pageId);
 }
 
 void ChangeProcessor::End()
@@ -135,4 +137,4 @@
     WritePage();
 
     dump->WriteIndexes();
-}
+}
\ No newline at end of file
diff --git a/Diff/ChangeProcessor.h b/Diff/ChangeProcessor.h
index 6253867..8628efd 100644
--- a/Diff/ChangeProcessor.h
+++ b/Diff/ChangeProcessor.h
@@ -11,8 +11,6 @@
 
     std::unique_ptr<DumpPage> currentPage;
 
-    std::unordered_set<std::uint32_t> visitedRevisionIds;
-
     void WritePage();
 public:
     ChangeProcessor(std::shared_ptr<WritableDump> dump);
@@ -24,7 +22,8 @@
     void Process(NewRevisionChange change);
     void Process(RevisionChange change);
     void Process(DeleteRevisionChange change);
-    void Process(DeletePageChange change);
+    void Process(FullDeletePageChange change);
+    void Process(PartialDeletePageChange change);
 
     // has to be called after processing is complete
     void End();
diff --git a/Diff/ChangeVisitor.h b/Diff/ChangeVisitor.h
index d2f59a6..26ea54a 100644
--- a/Diff/ChangeVisitor.h
+++ b/Diff/ChangeVisitor.h
@@ -7,7 +7,8 @@
 #include "Changes/NewRevisionChange.h"
 #include "Changes/RevisionChange.h"
 #include "Changes/DeleteRevisionChange.h"
-#include "Changes/DeletePageChange.h"
+#include "Changes/FullDeletePageChange.h"
+#include "Changes/PartialDeletePageChange.h"
 
 class ChangeVisitor
 {
@@ -19,5 +20,6 @@
     virtual void Visit(NewRevisionChange &change) = 0;
     virtual void Visit(RevisionChange &change) = 0;
     virtual void Visit(DeleteRevisionChange &change) = 0;
-    virtual void Visit(DeletePageChange &change) = 0;
+    virtual void Visit(FullDeletePageChange &change) = 0;
+    virtual void Visit(PartialDeletePageChange &change) = 0;
 };
\ No newline at end of file
diff --git a/Diff/Changes/Change.h b/Diff/Changes/Change.h
index aab409a..ef56629 100644
--- a/Diff/Changes/Change.h
+++ b/Diff/Changes/Change.h
@@ -6,17 +6,18 @@
 
 enum class ChangeKind : std::uint8_t
 {
-    SiteInfo       = 0x01,
+    SiteInfo          = 0x01,
 
-    NewPage        = 0x10,
-    ChangePage     = 0x11,
-    DeletePage     = 0x12,
+    NewPage           = 0x10,
+    ChangePage        = 0x11,
+    DeletePageFull    = 0x12,
+    DeletePagePartial = 0x13,
 
-    NewRevision    = 0x20,
-    ChangeRevision = 0x21,
-    DeleteRevision = 0x22,
+    NewRevision       = 0x20,
+    ChangeRevision    = 0x21,
+    DeleteRevision    = 0x22,
 
-    NewModelFormat = 0x30
+    NewModelFormat    = 0x30
 };
 
 class Change : public DumpObjectBase
diff --git a/Diff/Changes/DeletePageChange.cpp 
b/Diff/Changes/DeletePageChange.cpp
deleted file mode 100644
index a4833ce..0000000
--- a/Diff/Changes/DeletePageChange.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "DeletePageChange.h"
-#include "../ChangeVisitor.h"
-
-DeletePageChange DeletePageChange::Read(std::istream &stream)
-{
-    std::uint32_t pageId;
-    ReadValue(stream, pageId);
-
-    return DeletePageChange(pageId);
-}
-
-void DeletePageChange::WriteInternal()
-{
-    WriteValue(ChangeKind::DeletePage);
-    WriteValue(pageId);
-}
-
-std::uint32_t DeletePageChange::NewLength()
-{
-    return ValueSize(ChangeKind::DeletePage) + ValueSize(pageId);
-}
-
-void DeletePageChange::Accept(ChangeVisitor &visitor)
-{
-    visitor.Visit(*this);
-}
\ No newline at end of file
diff --git a/Diff/Changes/FullDeletePageChange.cpp 
b/Diff/Changes/FullDeletePageChange.cpp
new file mode 100644
index 0000000..a23d918
--- /dev/null
+++ b/Diff/Changes/FullDeletePageChange.cpp
@@ -0,0 +1,26 @@
+#include "FullDeletePageChange.h"
+#include "../ChangeVisitor.h"
+
+FullDeletePageChange FullDeletePageChange::Read(std::istream &stream)
+{
+    std::uint32_t pageId;
+    ReadValue(stream, pageId);
+
+    return FullDeletePageChange(pageId);
+}
+
+void FullDeletePageChange::WriteInternal()
+{
+    WriteValue(ChangeKind::DeletePageFull);
+    WriteValue(pageId);
+}
+
+std::uint32_t FullDeletePageChange::NewLength()
+{
+    return ValueSize(ChangeKind::DeletePageFull) + ValueSize(pageId);
+}
+
+void FullDeletePageChange::Accept(ChangeVisitor &visitor)
+{
+    visitor.Visit(*this);
+}
\ No newline at end of file
diff --git a/Diff/Changes/DeletePageChange.h 
b/Diff/Changes/FullDeletePageChange.h
similarity index 63%
rename from Diff/Changes/DeletePageChange.h
rename to Diff/Changes/FullDeletePageChange.h
index 98c6f4b..79761fc 100644
--- a/Diff/Changes/DeletePageChange.h
+++ b/Diff/Changes/FullDeletePageChange.h
@@ -2,16 +2,16 @@
 
 #include "Change.h"
 
-class DeletePageChange : public Change
+class FullDeletePageChange : public Change
 {
 public:
     std::uint32_t pageId;
 
-    DeletePageChange(std::uint32_t pageId)
+    FullDeletePageChange(std::uint32_t pageId)
         : pageId(pageId)
     {}
 
-    static DeletePageChange Read(std::istream &stream);
+    static FullDeletePageChange Read(std::istream &stream);
     virtual void WriteInternal() override;
     virtual std::uint32_t NewLength() override;
 
diff --git a/Diff/Changes/PartialDeletePageChange.cpp 
b/Diff/Changes/PartialDeletePageChange.cpp
new file mode 100644
index 0000000..d3e8247
--- /dev/null
+++ b/Diff/Changes/PartialDeletePageChange.cpp
@@ -0,0 +1,26 @@
+#include "PartialDeletePageChange.h"
+#include "../ChangeVisitor.h"
+
+PartialDeletePageChange PartialDeletePageChange::Read(std::istream &stream)
+{
+    std::uint32_t pageId;
+    ReadValue(stream, pageId);
+
+    return PartialDeletePageChange(pageId);
+}
+
+void PartialDeletePageChange::WriteInternal()
+{
+    WriteValue(ChangeKind::DeletePagePartial);
+    WriteValue(pageId);
+}
+
+std::uint32_t PartialDeletePageChange::NewLength()
+{
+    return ValueSize(ChangeKind::DeletePagePartial) + ValueSize(pageId);
+}
+
+void PartialDeletePageChange::Accept(ChangeVisitor &visitor)
+{
+    visitor.Visit(*this);
+}
\ No newline at end of file
diff --git a/Diff/Changes/DeletePageChange.h 
b/Diff/Changes/PartialDeletePageChange.h
similarity index 62%
copy from Diff/Changes/DeletePageChange.h
copy to Diff/Changes/PartialDeletePageChange.h
index 98c6f4b..65db24e 100644
--- a/Diff/Changes/DeletePageChange.h
+++ b/Diff/Changes/PartialDeletePageChange.h
@@ -2,16 +2,16 @@
 
 #include "Change.h"
 
-class DeletePageChange : public Change
+class PartialDeletePageChange : public Change
 {
 public:
     std::uint32_t pageId;
 
-    DeletePageChange(std::uint32_t pageId)
+    PartialDeletePageChange(std::uint32_t pageId)
         : pageId(pageId)
     {}
 
-    static DeletePageChange Read(std::istream &stream);
+    static PartialDeletePageChange Read(std::istream &stream);
     virtual void WriteInternal() override;
     virtual std::uint32_t NewLength() override;
 
diff --git a/Diff/DiffReader.cpp b/Diff/DiffReader.cpp
index 614ad3c..abb11c3 100644
--- a/Diff/DiffReader.cpp
+++ b/Diff/DiffReader.cpp
@@ -63,8 +63,11 @@
         case ChangeKind::DeleteRevision:
             changeProcessor.Process(DeleteRevisionChange::Read(*stream));
             break;
-        case ChangeKind::DeletePage:
-            changeProcessor.Process(DeletePageChange::Read(*stream));
+        case ChangeKind::DeletePageFull:
+            changeProcessor.Process(FullDeletePageChange::Read(*stream));
+            break;
+        case ChangeKind::DeletePagePartial:
+            changeProcessor.Process(PartialDeletePageChange::Read(*stream));
             break;
         default:
             throw DumpException();
diff --git a/Diff/DiffWriter.cpp b/Diff/DiffWriter.cpp
index b4d6bf7..e192d46 100644
--- a/Diff/DiffWriter.cpp
+++ b/Diff/DiffWriter.cpp
@@ -7,7 +7,8 @@
 #include "Changes/NewRevisionChange.h"
 #include "Changes/RevisionChange.h"
 #include "Changes/DeleteRevisionChange.h"
-#include "Changes/DeletePageChange.h"
+#include "Changes/FullDeletePageChange.h"
+#include "Changes/PartialDeletePageChange.h"
 #include "../DumpObjects/FileHeader.h"
 
 void DiffWriter::EnsurePageWritten()
@@ -139,10 +140,9 @@
 {
     if (!dumpStarted)
         throw DumpException();
-    if (!pageStarted)
-        throw DumpException();
 
-    EnsurePageWritten();
+    if (pageStarted)
+        EnsurePageWritten();
 
     DeleteRevisionChange change(revisionId);
     change.Write(stream.get());
@@ -161,13 +161,24 @@
     pageStarted = false;
 }
 
-void DiffWriter::DeletePage(const std::uint32_t pageId)
+void DiffWriter::DeletePageFull(std::uint32_t pageId)
 {
     if (!dumpStarted)
         throw DumpException();
     if (pageStarted)
         throw DumpException();
 
-    DeletePageChange change(pageId);
+    FullDeletePageChange change(pageId);
+    change.Write(stream.get());
+}
+
+void DiffWriter::DeletePagePartial(std::uint32_t pageId)
+{
+    if (!dumpStarted)
+        throw DumpException();
+    if (pageStarted)
+        throw DumpException();
+
+    PartialDeletePageChange change(pageId);
     change.Write(stream.get());
 }
\ No newline at end of file
diff --git a/Diff/DiffWriter.h b/Diff/DiffWriter.h
index 4f66281..76546ca 100644
--- a/Diff/DiffWriter.h
+++ b/Diff/DiffWriter.h
@@ -41,5 +41,6 @@
 
     void EndPage();
 
-    void DeletePage(const std::uint32_t pageId);
+    void DeletePageFull(std::uint32_t pageId);
+    void DeletePagePartial(std::uint32_t pageId);
 };
\ No newline at end of file
diff --git a/Dump.cpp b/Dump.cpp
index a52be7a..cfc1091 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -91,25 +91,51 @@
     modelFormatIndex->Write();
 }
 
-void WritableDump::DeletePage(std::uint32_t pageId, const 
std::unordered_set<std::uint32_t> &doNotDeleteRevisions)
+std::pair<bool, std::vector<std::uint32_t>> WritableDump::DeletePage(
+    std::uint32_t pageId, bool full, const std::unordered_set<std::uint32_t> 
&doNotDeleteRevisions)
 {
     Offset offset = pageIdIndex->Get(pageId);
     DumpPage page(self, pageId);
     std::uint32_t length = page.NewLength();
 
-    for (auto revisionId : page.page.RevisionIds)
+    bool allRevisionsDeleted = true;
+    std::vector<std::uint32_t> deletedRevisions;
+
+    if (full)
     {
-        DeleteRevision(revisionId, doNotDeleteRevisions);
+        for (auto revisionId : page.page.RevisionIds)
+        {
+            if (DeleteRevision(revisionId, doNotDeleteRevisions))
+                deletedRevisions.push_back(revisionId);
+            else
+                allRevisionsDeleted = false;
+        }
     }
 
     pageIdIndex->Remove(pageId);
     spaceManager->Delete(offset.value, length);
+
+    if (allRevisionsDeleted)
+        return std::make_pair(true, std::vector<std::uint32_t>());
+    
+    return std::make_pair(false, deletedRevisions);
 }
 
-void WritableDump::DeleteRevision(std::uint32_t revisionId, const 
std::unordered_set<std::uint32_t> &doNotDeleteRevisions)
+void WritableDump::DeletePagePartial(std::uint32_t pageId)
+{
+    DeletePage(pageId, false, std::unordered_set<std::uint32_t>());
+}
+
+std::pair<bool, std::vector<std::uint32_t>> WritableDump::DeletePageFull(
+    std::uint32_t pageId, const std::unordered_set<std::uint32_t> 
&doNotDeleteRevisions)
+{
+    return DeletePage(pageId, true, doNotDeleteRevisions);
+}
+
+bool WritableDump::DeleteRevision(std::uint32_t revisionId, const 
std::unordered_set<std::uint32_t> &doNotDeleteRevisions)
 {
     if (doNotDeleteRevisions.count(revisionId) != 0)
-        return;
+        return false;
 
     Offset offset = revisionIdIndex->Get(revisionId);
     DumpRevision revision(self, revisionId, false);
@@ -117,6 +143,8 @@
 
     revisionIdIndex->Remove(revisionId);
     spaceManager->Delete(offset.value, length);
+
+    return true;
 }
 
 const std::string WritableDump::WikitextModel = "wikitext";
diff --git a/Dump.h b/Dump.h
index c8aef2d..e797b79 100644
--- a/Dump.h
+++ b/Dump.h
@@ -22,6 +22,7 @@
 
 class SpaceManager;
 
+// TODO: ReadableDump is unusable; either merge it with WritableDump, or make 
it usable
 class ReadableDump
 {
 protected:
@@ -29,7 +30,7 @@
 
     ReadableDump(unique_ptr<iostream> stream);
 public:
-    // TODO: others should not be able to steal this stream
+    // TODO: others should not be able to steal these
     unique_ptr<iostream> stream;
     unique_ptr<Index<uint32_t, Offset>> pageIdIndex;
     unique_ptr<Index<uint32_t, Offset>> revisionIdIndex;
@@ -52,6 +53,8 @@
     WritableDump(string fileName);
     void init(std::weak_ptr<WritableDump> self);
 
+    std::pair<bool, std::vector<std::uint32_t>> DeletePage(
+        std::uint32_t pageId, bool fullDelete, const 
std::unordered_set<std::uint32_t> &doNotDeleteRevisions);
 public:
     static const std::string WikitextModel;
     static const std::string WikitextFormat;
@@ -64,9 +67,15 @@
     // it's necessary to call this after writing is finished
     void WriteIndexes();
 
-    // also recursively deletes revisions of the given page
-    void DeletePage(std::uint32_t pageId, const 
std::unordered_set<std::uint32_t> &doNotDeleteRevisions = 
std::unordered_set<std::uint32_t>());
-    void DeleteRevision(std::uint32_t revisionId, const 
std::unordered_set<std::uint32_t> &doNotDeleteRevisions = 
std::unordered_set<std::uint32_t>());
+    void DeletePagePartial(std::uint32_t pageId);
+    // also deletes revisions of the given page
+    // first member of returned pair indicates whether all revisions of the 
page were deleted
+    // if it's false, second member contains ids of revisions that were 
actually deleted
+    std::pair<bool, std::vector<std::uint32_t>> DeletePageFull(
+        std::uint32_t pageId, const std::unordered_set<std::uint32_t> 
&doNotDeleteRevisions = std::unordered_set<std::uint32_t>());
+    // returns whether the revision was actually deleted
+    bool DeleteRevision(
+        std::uint32_t revisionId, const std::unordered_set<std::uint32_t> 
&doNotDeleteRevisions = std::unordered_set<std::uint32_t>());
 
     std::uint8_t GetIdForModelFormat(std::string model, std::string format, 
bool &isNew);
     std::pair<std::string, std::string> GetModelFormat(std::uint8_t id);
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index c6a4aa5..96cfa67 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -105,10 +105,22 @@
     {
         if (unvisitedPageIds.at(i))
         {
-            dump->DeletePage(i, newRevisionIds);
+            auto deletedRevisions = dump->DeletePageFull(i, newRevisionIds);
 
             if (diffWriter != nullptr)
-                diffWriter->DeletePage(i);
+            {
+                if (deletedRevisions.first)
+                {
+                    diffWriter->DeletePageFull(i);
+                }
+                else
+                {
+                    diffWriter->DeletePagePartial(i);
+
+                    for (auto revisionId : deletedRevisions.second)
+                        diffWriter->DeleteRevision(revisionId);
+                }
+            }
         }
     }
 
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index 82f65ed..dfb84b4 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -83,8 +83,9 @@
     <ClCompile Include="CollectionHelpers.cpp" />
     <ClCompile Include="Diff\ChangeProcessor.cpp" />
     <ClCompile Include="Diff\Changes\Change.cpp" />
-    <ClCompile Include="Diff\Changes\DeletePageChange.cpp" />
     <ClCompile Include="Diff\Changes\DeleteRevisionChange.cpp" />
+    <ClCompile Include="Diff\Changes\FullDeletePageChange.cpp" />
+    <ClCompile Include="Diff\Changes\PartialDeletePageChange.cpp" />
     <ClCompile Include="Diff\DiffReader.cpp" />
     <ClCompile Include="Diff\DiffWriter.cpp" />
     <ClCompile Include="Diff\Changes\NewModelFormatChange.cpp" />
@@ -105,8 +106,9 @@
     <ClInclude Include="CollectionHelpers.h" />
     <ClInclude Include="Diff\ChangeProcessor.h" />
     <ClInclude Include="Diff\Changes\Change.h" />
-    <ClInclude Include="Diff\Changes\DeletePageChange.h" />
     <ClInclude Include="Diff\Changes\DeleteRevisionChange.h" />
+    <ClInclude Include="Diff\Changes\FullDeletePageChange.h" />
+    <ClInclude Include="Diff\Changes\PartialDeletePageChange.h" />
     <ClInclude Include="Diff\ChangeVisitor.h" />
     <ClInclude Include="Diff\DiffReader.h" />
     <ClInclude Include="Diff\DiffWriter.h" />

-- 
To view, visit https://gerrit.wikimedia.org/r/81699
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I63ce069375cdfa71a19021ed06bdd846683b2fdb
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
Gerrit-Reviewer: Petr Onderka <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to