Petr Onderka has uploaded a new change for review.
https://gerrit.wikimedia.org/r/83784
Change subject: Write page after its revisions
......................................................................
Write page after its revisions
This is necessary when creating big dumps from XML.
Without this, all revisions of a page have to be kept in memory,
including their texts.
A downside is that this will increase seeking when reading.
Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01
---
M DumpObjects/DumpPage.cpp
M DumpObjects/DumpPage.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
4 files changed, 36 insertions(+), 50 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental
refs/changes/84/83784/1
diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
index 1d074be..b5e695b 100644
--- a/DumpObjects/DumpPage.cpp
+++ b/DumpObjects/DumpPage.cpp
@@ -40,35 +40,28 @@
void DumpPage::Write()
{
if (wasLoaded && originalPage == page)
- {
- if (diffWriter != nullptr)
- diffWriter->StartExistingPage(page.PageId);
-
return;
- }
DumpObject::Write();
}
-void DumpPage::Write(DiffWriter *diffWriter)
+void DumpPage::WriteDiff(DiffWriter &diffWriter)
{
- this->diffWriter = diffWriter;
- Write();
- this->diffWriter = nullptr;
+ if (wasLoaded)
+ {
+ if (originalPage == page)
+ diffWriter.StartExistingPage(page.PageId);
+ else
+ diffWriter.StartExistingPage(originalPage, page);
+ }
+ else
+ diffWriter.StartNewPage(page);
}
void DumpPage::WriteInternal()
{
WriteValue((uint8_t)DumpObjectKind::Page);
WriteCore(*stream, page, true);
-
- if (diffWriter != nullptr)
- {
- if (wasLoaded)
- diffWriter->StartExistingPage(originalPage, page);
- else
- diffWriter->StartNewPage(page);
- }
}
void DumpPage::UpdateIndex(Offset offset, bool overwrite)
@@ -87,13 +80,13 @@
}
DumpPage::DumpPage(weak_ptr<WritableDump> dump, uint32_t pageId)
- : DumpObject(dump), page(), diffWriter()
+ : DumpObject(dump), page()
{
Load(pageId);
}
DumpPage::DumpPage(weak_ptr<WritableDump> dump, Offset offset)
- : DumpObject(dump), page(), diffWriter()
+ : DumpObject(dump), page()
{
auto dumpRef = dump.lock();
diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h
index 4c230bf..3d1e16d 100644
--- a/DumpObjects/DumpPage.h
+++ b/DumpObjects/DumpPage.h
@@ -11,8 +11,6 @@
Page originalPage;
bool wasLoaded;
- DiffWriter *diffWriter;
-
void Load(std::uint32_t pageId);
static Page Read(std::shared_ptr<WritableDump> dump, Offset offset);
protected:
@@ -25,7 +23,7 @@
DumpPage(std::weak_ptr<WritableDump> dump, Offset offset);
virtual void Write() override;
- void Write(DiffWriter *diffWriter);
+ void WriteDiff(DiffWriter& diffWriter);
virtual std::uint32_t NewLength() override;
static Page ReadCore(std::istream &stream, bool includeRevisionIds);
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 96cfa67..d1423fe 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -31,39 +31,38 @@
oldPage = this->page->page;
this->page->page = *page;
unset(unvisitedPageIds, pageId);
+
+ if (diffWriter != nullptr)
+ this->page->WriteDiff(*diffWriter);
}
void DumpWriter::AddRevision(const std::shared_ptr<const Revision> revision)
{
page->page.RevisionIds.insert(revision->RevisionId);
- revisions.push_back(revision);
+
+ DumpRevision dumpRevision(dump, revision->RevisionId, false);
+ dumpRevision.revision = *revision;
+
+ if (diffWriter != nullptr)
+ {
+ bool isNew;
+ std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
+
+ if (isNew)
+ diffWriter->NewModelFormat(modelFormatId,
dumpRevision.revision.Model, dumpRevision.revision.Format);
+ }
+
+ bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId);
+
+ if (newRevision)
+ newRevisionIds.insert(revision->RevisionId);
+
+ dumpRevision.Write(diffWriter.get(), newRevision);
}
void DumpWriter::EndPage()
{
- page->Write(diffWriter.get());
-
- for (auto revision : revisions)
- {
- DumpRevision dumpRevision(dump, revision->RevisionId, false);
- dumpRevision.revision = *revision;
-
- if (diffWriter != nullptr)
- {
- bool isNew;
- std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
-
- if (isNew)
- diffWriter->NewModelFormat(modelFormatId,
dumpRevision.revision.Model, dumpRevision.revision.Format);
- }
-
- bool newRevision = !contains(oldPage.RevisionIds,
revision->RevisionId);
-
- if (newRevision)
- newRevisionIds.insert(revision->RevisionId);
-
- dumpRevision.Write(diffWriter.get(), newRevision);
- }
+ page->Write();
auto deletedRevisionIds = except(oldPage.RevisionIds,
page->page.RevisionIds);
@@ -78,7 +77,6 @@
if (diffWriter != nullptr)
diffWriter->EndPage();
page = nullptr;
- revisions.clear();
}
void DumpWriter::SetDumpKind(DumpKind dumpKind)
diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h
index 20df235..233f9ea 100644
--- a/DumpWriters/DumpWriter.h
+++ b/DumpWriters/DumpWriter.h
@@ -12,9 +12,6 @@
std::unique_ptr<DiffWriter> diffWriter;
std::unique_ptr<DumpPage> page;
- // this is necessary, so that page object can be on disk before its
revision objects
- // it shouldn't waste too much memory, because these revisions don't
contain any text
- std::vector<std::shared_ptr<const Revision>> revisions;
Page oldPage;
std::vector<bool> unvisitedPageIds;
--
To view, visit https://gerrit.wikimedia.org/r/83784
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits