Petr Onderka has uploaded a new change for review. https://gerrit.wikimedia.org/r/80987
Change subject: Make sure revisions of a page are sorted by their id ...................................................................... Make sure revisions of a page are sorted by their id Change-Id: Id5a79bf9b4ee7a7d3fed9c340db34f4673207b0c --- M CollectionHelpers.h M Diff/ChangeProcessor.cpp M Diff/Changes/PageChange.cpp M DumpObjects/DumpPage.cpp M DumpObjects/DumpTraits.h M DumpWriters/DumpWriter.cpp M Objects/Page.h M TODO.txt 8 files changed, 82 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/87/80987/1 diff --git a/CollectionHelpers.h b/CollectionHelpers.h index 9cee037..b9abde6 100644 --- a/CollectionHelpers.h +++ b/CollectionHelpers.h @@ -1,6 +1,7 @@ #pragma once #include <vector> +#include <set> #include <unordered_set> #include <algorithm> @@ -22,13 +23,13 @@ } template <typename T> -bool contains(T& container, const typename T::value_type& value) +bool contains(const std::set<T> &container, const T &value) { - return std::find(container.begin(), container.end(), value) != container.end(); + return container.count(value) > 0; } template <typename T> -std::unordered_set<T> except(std::vector<T> container1, std::vector<T> container2) +std::unordered_set<T> except(const std::vector<T> &container1, const std::vector<T> &container2) { std::unordered_set<T> set(container1.begin(), container1.end()); @@ -38,6 +39,16 @@ return set; } +template <typename T> +std::set<T> except(const std::set<T> &container1, const std::set<T> &container2) +{ + std::set<T> result = container1; + + for (auto item : container2) + result.erase(item); + + return result; +} void set(std::vector<bool> &vector, std::size_t index); diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp index ca9613a..bbdbc5b 100644 --- a/Diff/ChangeProcessor.cpp +++ b/Diff/ChangeProcessor.cpp @@ -69,7 +69,7 @@ dumpRevision.SetModelFormatId(change.modelFormatId); dumpRevision.Write(); - currentPage->page.RevisionIds.push_back(change.revision.RevisionId); + currentPage->page.RevisionIds.insert(change.revision.RevisionId); visitedRevisionIds.insert(change.revision.RevisionId); } @@ -111,8 +111,7 @@ dumpRevision.Write(); - if (!contains(currentPage->page.RevisionIds, revision.RevisionId)) - currentPage->page.RevisionIds.push_back(revision.RevisionId); + currentPage->page.RevisionIds.insert(revision.RevisionId); visitedRevisionIds.insert(change.revisionChanges.RevisionId); } diff --git a/Diff/Changes/PageChange.cpp b/Diff/Changes/PageChange.cpp index 0533e9d..d4ee622 100644 --- a/Diff/Changes/PageChange.cpp +++ b/Diff/Changes/PageChange.cpp @@ -24,7 +24,7 @@ { pageChanges = newPage; // revision ids are not needed here, so we can save some memory - pageChanges.RevisionIds = std::vector<std::uint32_t>(); + pageChanges.RevisionIds = std::set<std::uint32_t>(); flags = PageChangeFlags::NoChanges; diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp index c4fb343..1d074be 100644 --- a/DumpObjects/DumpPage.cpp +++ b/DumpObjects/DumpPage.cpp @@ -106,12 +106,12 @@ { Page page; - page.PageId = DumpTraits<uint32_t>::Read(stream); - page.Namespace = DumpTraits<int16_t>::Read(stream); - page.Title = DumpTraits<string>::Read(stream); - page.RedirectTarget = DumpTraits<string>::Read(stream); + page.PageId = DumpTraits<std::uint32_t>::Read(stream); + page.Namespace = DumpTraits<std::int16_t>::Read(stream); + page.Title = DumpTraits<std::string>::Read(stream); + page.RedirectTarget = DumpTraits<std::string>::Read(stream); if (includeRevisionIds) - page.RevisionIds = DumpTraits<vector<uint32_t>>::Read(stream); + page.RevisionIds = DumpTraits<std::set<std::uint32_t>>::Read(stream); return page; } diff --git a/DumpObjects/DumpTraits.h b/DumpObjects/DumpTraits.h index faa5a9f..7c82c01 100644 --- a/DumpObjects/DumpTraits.h +++ b/DumpObjects/DumpTraits.h @@ -7,6 +7,7 @@ #include <iostream> #include <array> #include <vector> +#include <set> #include <map> #include "../DumpException.h" @@ -264,16 +265,16 @@ }; template<typename T> -class DumpTraits<vector<T>> +class DumpTraits<std::vector<T>> { public: - static vector<T> Read(istream &stream) + static std::vector<T> Read(std::istream &stream) { - uint32_t count = DumpTraits<uint32_t>::Read(stream); + std::uint32_t count = DumpTraits<std::uint32_t>::Read(stream); - vector<T> result; + std::vector<T> result; - for (uint32_t i = 0; i < count; i++) + for (std::uint32_t i = 0; i < count; i++) { result.push_back(DumpTraits<T>::Read(stream)); } @@ -281,14 +282,14 @@ return result; } - static void Write(ostream &stream, const vector<T> &value) + static void Write(std::ostream &stream, const std::vector<T> &value) { auto length = value.size(); - if (length >= numeric_limits<uint32_t>::max()) + if (length >= std::numeric_limits<std::uint32_t>::max()) throw DumpException(); - DumpTraits<uint32_t>::Write(stream, length); + DumpTraits<std::uint32_t>::Write(stream, length); for (T item : value) { @@ -296,9 +297,9 @@ } } - static uint32_t DumpSize(const vector<T> &value) + static std::uint32_t DumpSize(const std::vector<T> &value) { - uint32_t size = DumpTraits<uint32_t>::DumpSize(value.size()); + std::uint32_t size = DumpTraits<std::uint32_t>::DumpSize(value.size()); for (T item : value) { @@ -346,6 +347,52 @@ } }; +template<typename T> +class DumpTraits<std::set<T>> +{ +public: + static std::set<T> Read(std::istream &stream) + { + std::uint32_t count = DumpTraits<std::uint32_t>::Read(stream); + + std::set<T> result; + + for (std::uint32_t i = 0; i < count; i++) + { + result.insert(result.end(), DumpTraits<T>::Read(stream)); + } + + return result; + } + + static void Write(std::ostream &stream, const std::set<T> &value) + { + auto length = value.size(); + + if (length >= std::numeric_limits<std::uint32_t>::max()) + throw DumpException(); + + DumpTraits<std::uint32_t>::Write(stream, length); + + for (T item : value) + { + DumpTraits<T>::Write(stream, item); + } + } + + static std::uint32_t DumpSize(const std::set<T> &value) + { + std::uint32_t size = DumpTraits<std::uint32_t>::DumpSize(value.size()); + + for (T item : value) + { + size += DumpTraits<T>::DumpSize(item); + } + + return size; + } +}; + template<typename TKey, typename TValue> class DumpTraits<std::map<TKey, TValue>> { diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp index 2ac4611..dad24e9 100644 --- a/DumpWriters/DumpWriter.cpp +++ b/DumpWriters/DumpWriter.cpp @@ -35,7 +35,7 @@ void DumpWriter::AddRevision(const std::shared_ptr<const Revision> revision) { - page->page.RevisionIds.push_back(revision->RevisionId); + page->page.RevisionIds.insert(revision->RevisionId); revisions.push_back(revision); } diff --git a/Objects/Page.h b/Objects/Page.h index bc63c17..de27a05 100644 --- a/Objects/Page.h +++ b/Objects/Page.h @@ -2,7 +2,7 @@ #include <cstdint> #include <string> -#include <vector> +#include <set> class Page { @@ -13,7 +13,7 @@ // if empty, the page is not a redirect std::string RedirectTarget; - std::vector<std::uint32_t> RevisionIds; + std::set<std::uint32_t> RevisionIds; Page(); }; diff --git a/TODO.txt b/TODO.txt index 96f258a..7e5fa71 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,4 @@ This is a list of smaller things that should (or could) be done. -short term: -- dump name and timestamp -- order revisions by id - long term: - https://www.mediawiki.org/wiki/User:ArielGlenn/Dumps_new_format_%28deltas,_changesets%29#Open_issues_on_GSOC_2013_project -- To view, visit https://gerrit.wikimedia.org/r/80987 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id5a79bf9b4ee7a7d3fed9c340db34f4673207b0c Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps/incremental Gerrit-Branch: gsoc Gerrit-Owner: Petr Onderka <gsv...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits