Petr Onderka has submitted this change and it was merged.
Change subject: Make sure revisions of a page are sorted by their id
......................................................................
Make sure revisions of a page are sorted by their id
Change-Id: Id5a79bf9b4ee7a7d3fed9c340db34f4673207b0c
---
M CollectionHelpers.h
M Diff/ChangeProcessor.cpp
M Diff/Changes/PageChange.cpp
M DumpObjects/DumpPage.cpp
M DumpObjects/DumpTraits.h
M DumpWriters/DumpWriter.cpp
M Objects/Page.h
M TODO.txt
8 files changed, 82 insertions(+), 29 deletions(-)
Approvals:
Petr Onderka: Verified; Looks good to me, approved
diff --git a/CollectionHelpers.h b/CollectionHelpers.h
index 9cee037..b9abde6 100644
--- a/CollectionHelpers.h
+++ b/CollectionHelpers.h
@@ -1,6 +1,7 @@
#pragma once
#include <vector>
+#include <set>
#include <unordered_set>
#include <algorithm>
@@ -22,13 +23,13 @@
}
template <typename T>
-bool contains(T& container, const typename T::value_type& value)
+bool contains(const std::set<T> &container, const T &value)
{
- return std::find(container.begin(), container.end(), value) != container.end();
+ return container.count(value) > 0;
}
template <typename T>
-std::unordered_set<T> except(std::vector<T> container1, std::vector<T> container2)
+std::unordered_set<T> except(const std::vector<T> &container1, const std::vector<T> &container2)
{
std::unordered_set<T> set(container1.begin(), container1.end());
@@ -38,6 +39,16 @@
return set;
}
+template <typename T>
+std::set<T> except(const std::set<T> &container1, const std::set<T> &container2)
+{
+ std::set<T> result = container1;
+
+ for (auto item : container2)
+ result.erase(item);
+
+ return result;
+}
void set(std::vector<bool> &vector, std::size_t index);
diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp
index ca9613a..bbdbc5b 100644
--- a/Diff/ChangeProcessor.cpp
+++ b/Diff/ChangeProcessor.cpp
@@ -69,7 +69,7 @@
dumpRevision.SetModelFormatId(change.modelFormatId);
dumpRevision.Write();
- currentPage->page.RevisionIds.push_back(change.revision.RevisionId);
+ currentPage->page.RevisionIds.insert(change.revision.RevisionId);
visitedRevisionIds.insert(change.revision.RevisionId);
}
@@ -111,8 +111,7 @@
dumpRevision.Write();
- if (!contains(currentPage->page.RevisionIds, revision.RevisionId))
- currentPage->page.RevisionIds.push_back(revision.RevisionId);
+ currentPage->page.RevisionIds.insert(revision.RevisionId);
visitedRevisionIds.insert(change.revisionChanges.RevisionId);
}
diff --git a/Diff/Changes/PageChange.cpp b/Diff/Changes/PageChange.cpp
index 0533e9d..d4ee622 100644
--- a/Diff/Changes/PageChange.cpp
+++ b/Diff/Changes/PageChange.cpp
@@ -24,7 +24,7 @@
{
pageChanges = newPage;
// revision ids are not needed here, so we can save some memory
- pageChanges.RevisionIds = std::vector<std::uint32_t>();
+ pageChanges.RevisionIds = std::set<std::uint32_t>();
flags = PageChangeFlags::NoChanges;
diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
index c4fb343..1d074be 100644
--- a/DumpObjects/DumpPage.cpp
+++ b/DumpObjects/DumpPage.cpp
@@ -106,12 +106,12 @@
{
Page page;
- page.PageId = DumpTraits<uint32_t>::Read(stream);
- page.Namespace = DumpTraits<int16_t>::Read(stream);
- page.Title = DumpTraits<string>::Read(stream);
- page.RedirectTarget = DumpTraits<string>::Read(stream);
+ page.PageId = DumpTraits<std::uint32_t>::Read(stream);
+ page.Namespace = DumpTraits<std::int16_t>::Read(stream);
+ page.Title = DumpTraits<std::string>::Read(stream);
+ page.RedirectTarget = DumpTraits<std::string>::Read(stream);
if (includeRevisionIds)
- page.RevisionIds = DumpTraits<vector<uint32_t>>::Read(stream);
+ page.RevisionIds = DumpTraits<std::set<std::uint32_t>>::Read(stream);
return page;
}
diff --git a/DumpObjects/DumpTraits.h b/DumpObjects/DumpTraits.h
index faa5a9f..7c82c01 100644
--- a/DumpObjects/DumpTraits.h
+++ b/DumpObjects/DumpTraits.h
@@ -7,6 +7,7 @@
#include <iostream>
#include <array>
#include <vector>
+#include <set>
#include <map>
#include "../DumpException.h"
@@ -264,16 +265,16 @@
};
template<typename T>
-class DumpTraits<vector<T>>
+class DumpTraits<std::vector<T>>
{
public:
- static vector<T> Read(istream &stream)
+ static std::vector<T> Read(std::istream &stream)
{
- uint32_t count = DumpTraits<uint32_t>::Read(stream);
+ std::uint32_t count = DumpTraits<std::uint32_t>::Read(stream);
- vector<T> result;
+ std::vector<T> result;
- for (uint32_t i = 0; i < count; i++)
+ for (std::uint32_t i = 0; i < count; i++)
{
result.push_back(DumpTraits<T>::Read(stream));
}
@@ -281,14 +282,14 @@
return result;
}
- static void Write(ostream &stream, const vector<T> &value)
+ static void Write(std::ostream &stream, const std::vector<T> &value)
{
auto length = value.size();
- if (length >= numeric_limits<uint32_t>::max())
+ if (length >= std::numeric_limits<std::uint32_t>::max())
throw DumpException();
- DumpTraits<uint32_t>::Write(stream, length);
+ DumpTraits<std::uint32_t>::Write(stream, length);
for (T item : value)
{
@@ -296,9 +297,9 @@
}
}
- static uint32_t DumpSize(const vector<T> &value)
+ static std::uint32_t DumpSize(const std::vector<T> &value)
{
- uint32_t size = DumpTraits<uint32_t>::DumpSize(value.size());
+ std::uint32_t size = DumpTraits<std::uint32_t>::DumpSize(value.size());
for (T item : value)
{
@@ -346,6 +347,52 @@
}
};
+template<typename T>
+class DumpTraits<std::set<T>>
+{
+public:
+ static std::set<T> Read(std::istream &stream)
+ {
+ std::uint32_t count = DumpTraits<std::uint32_t>::Read(stream);
+
+ std::set<T> result;
+
+ for (std::uint32_t i = 0; i < count; i++)
+ {
+ result.insert(result.end(), DumpTraits<T>::Read(stream));
+ }
+
+ return result;
+ }
+
+ static void Write(std::ostream &stream, const std::set<T> &value)
+ {
+ auto length = value.size();
+
+ if (length >= std::numeric_limits<std::uint32_t>::max())
+ throw DumpException();
+
+ DumpTraits<std::uint32_t>::Write(stream, length);
+
+ for (T item : value)
+ {
+ DumpTraits<T>::Write(stream, item);
+ }
+ }
+
+ static std::uint32_t DumpSize(const std::set<T> &value)
+ {
+ std::uint32_t size = DumpTraits<std::uint32_t>::DumpSize(value.size());
+
+ for (T item : value)
+ {
+ size += DumpTraits<T>::DumpSize(item);
+ }
+
+ return size;
+ }
+};
+
template<typename TKey, typename TValue>
class DumpTraits<std::map<TKey, TValue>>
{
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 2ac4611..dad24e9 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -35,7 +35,7 @@
void DumpWriter::AddRevision(const std::shared_ptr<const Revision> revision)
{
- page->page.RevisionIds.push_back(revision->RevisionId);
+ page->page.RevisionIds.insert(revision->RevisionId);
revisions.push_back(revision);
}
diff --git a/Objects/Page.h b/Objects/Page.h
index bc63c17..de27a05 100644
--- a/Objects/Page.h
+++ b/Objects/Page.h
@@ -2,7 +2,7 @@
#include <cstdint>
#include <string>
-#include <vector>
+#include <set>
class Page
{
@@ -13,7 +13,7 @@
// if empty, the page is not a redirect
std::string RedirectTarget;
- std::vector<std::uint32_t> RevisionIds;
+ std::set<std::uint32_t> RevisionIds;
Page();
};
diff --git a/TODO.txt b/TODO.txt
index 96f258a..7e5fa71 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,8 +1,4 @@
This is a list of smaller things that should (or could) be done.
-short term:
-- dump name and timestamp
-- order revisions by id
-
long term:
-
https://www.mediawiki.org/wiki/User:ArielGlenn/Dumps_new_format_%28deltas,_changesets%29#Open_issues_on_GSOC_2013_project
--
To view, visit https://gerrit.wikimedia.org/r/80987
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Id5a79bf9b4ee7a7d3fed9c340db34f4673207b0c
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
Gerrit-Reviewer: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits