Petr Onderka has submitted this change and it was merged.

Change subject: Write page after its revisions
......................................................................


Write page after its revisions

This is necessary when creating big dumps from XML.
Without this, all revisions of a page have to be kept in memory,
including their texts.
A downside is that this will increase seeking when reading.

Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01
---
M DumpObjects/DumpPage.cpp
M DumpObjects/DumpPage.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
4 files changed, 36 insertions(+), 50 deletions(-)

Approvals:
  Petr Onderka: Verified; Looks good to me, approved



diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
index 1d074be..b5e695b 100644
--- a/DumpObjects/DumpPage.cpp
+++ b/DumpObjects/DumpPage.cpp
@@ -40,35 +40,28 @@
 void DumpPage::Write()
 {
     if (wasLoaded && originalPage == page)
-    {
-        if (diffWriter != nullptr)
-            diffWriter->StartExistingPage(page.PageId);
-
         return;
-    }
 
     DumpObject::Write();
 }
 
-void DumpPage::Write(DiffWriter *diffWriter)
+void DumpPage::WriteDiff(DiffWriter &diffWriter)
 {
-    this->diffWriter = diffWriter;
-    Write();
-    this->diffWriter = nullptr;
+    if (wasLoaded)
+    {
+        if (originalPage == page)
+            diffWriter.StartExistingPage(page.PageId);
+        else
+            diffWriter.StartExistingPage(originalPage, page);
+    }
+    else
+        diffWriter.StartNewPage(page);
 }
 
 void DumpPage::WriteInternal()
 {
     WriteValue((uint8_t)DumpObjectKind::Page);
     WriteCore(*stream, page, true);
-
-    if (diffWriter != nullptr)
-    {
-        if (wasLoaded)
-            diffWriter->StartExistingPage(originalPage, page);
-        else
-            diffWriter->StartNewPage(page);
-    }
 }
 
 void DumpPage::UpdateIndex(Offset offset, bool overwrite)
@@ -87,13 +80,13 @@
 }
 
 DumpPage::DumpPage(weak_ptr<WritableDump> dump, uint32_t pageId)
-    : DumpObject(dump), page(), diffWriter()
+    : DumpObject(dump), page()
 {
     Load(pageId);
 }
 
 DumpPage::DumpPage(weak_ptr<WritableDump> dump, Offset offset)
-    : DumpObject(dump), page(), diffWriter()
+    : DumpObject(dump), page()
 {
     auto dumpRef = dump.lock();
 
diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h
index 4c230bf..3d1e16d 100644
--- a/DumpObjects/DumpPage.h
+++ b/DumpObjects/DumpPage.h
@@ -11,8 +11,6 @@
     Page originalPage;
     bool wasLoaded;
 
-    DiffWriter *diffWriter;
-
     void Load(std::uint32_t pageId);
     static Page Read(std::shared_ptr<WritableDump> dump, Offset offset);
 protected:
@@ -25,7 +23,7 @@
     DumpPage(std::weak_ptr<WritableDump> dump, Offset offset);
 
     virtual void Write() override;
-    void Write(DiffWriter *diffWriter);
+    void WriteDiff(DiffWriter& diffWriter);
     virtual std::uint32_t NewLength() override;
 
     static Page ReadCore(std::istream &stream, bool includeRevisionIds);
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 96cfa67..d1423fe 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -31,39 +31,38 @@
     oldPage = this->page->page;
     this->page->page = *page;
     unset(unvisitedPageIds, pageId);
+
+    if (diffWriter != nullptr)
+        this->page->WriteDiff(*diffWriter);
 }
 
 void DumpWriter::AddRevision(const std::shared_ptr<const Revision> revision)
 {
     page->page.RevisionIds.insert(revision->RevisionId);
-    revisions.push_back(revision);
+
+    DumpRevision dumpRevision(dump, revision->RevisionId, false);
+    dumpRevision.revision = *revision;
+
+    if (diffWriter != nullptr)
+    {
+        bool isNew;
+        std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
+
+        if (isNew)
+            diffWriter->NewModelFormat(modelFormatId, dumpRevision.revision.Model, dumpRevision.revision.Format);
+    }
+
+    bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId);
+
+    if (newRevision)
+        newRevisionIds.insert(revision->RevisionId);
+
+    dumpRevision.Write(diffWriter.get(), newRevision);
 }
 
 void DumpWriter::EndPage()
 {
-    page->Write(diffWriter.get());
-
-    for (auto revision : revisions)
-    {
-        DumpRevision dumpRevision(dump, revision->RevisionId, false);
-        dumpRevision.revision = *revision;
-
-        if (diffWriter != nullptr)
-        {
-            bool isNew;
-            std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
-
-            if (isNew)
-                diffWriter->NewModelFormat(modelFormatId, dumpRevision.revision.Model, dumpRevision.revision.Format);
-        }
-
-        bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId);
-
-        if (newRevision)
-            newRevisionIds.insert(revision->RevisionId);
-
-        dumpRevision.Write(diffWriter.get(), newRevision);
-    }
+    page->Write();
 
     auto deletedRevisionIds = except(oldPage.RevisionIds, page->page.RevisionIds);
 
@@ -78,7 +77,6 @@
     if (diffWriter != nullptr)
         diffWriter->EndPage();
     page = nullptr;
-    revisions.clear();
 }
 
 void DumpWriter::SetDumpKind(DumpKind dumpKind)
diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h
index 20df235..233f9ea 100644
--- a/DumpWriters/DumpWriter.h
+++ b/DumpWriters/DumpWriter.h
@@ -12,9 +12,6 @@
     std::unique_ptr<DiffWriter> diffWriter;
 
     std::unique_ptr<DumpPage> page;
-    // this is necessary, so that page object can be on disk before its revision objects
-    // it shouldn't waste too much memory, because these revisions don't contain any text
-    std::vector<std::shared_ptr<const Revision>> revisions;
     Page oldPage;
 
     std::vector<bool> unvisitedPageIds;

-- 
To view, visit https://gerrit.wikimedia.org/r/83784
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
Gerrit-Reviewer: Petr Onderka <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to