Petr Onderka has submitted this change and it was merged.

Change subject: Make sure revisions of a page are sorted by their id
......................................................................


Make sure revisions of a page are sorted by their id

Change-Id: Id5a79bf9b4ee7a7d3fed9c340db34f4673207b0c
---
M CollectionHelpers.h
M Diff/ChangeProcessor.cpp
M Diff/Changes/PageChange.cpp
M DumpObjects/DumpPage.cpp
M DumpObjects/DumpTraits.h
M DumpWriters/DumpWriter.cpp
M Objects/Page.h
M TODO.txt
8 files changed, 82 insertions(+), 29 deletions(-)

Approvals:
  Petr Onderka: Verified; Looks good to me, approved



diff --git a/CollectionHelpers.h b/CollectionHelpers.h
index 9cee037..b9abde6 100644
--- a/CollectionHelpers.h
+++ b/CollectionHelpers.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <vector>
+#include <set>
 #include <unordered_set>
 #include <algorithm>
 
@@ -22,13 +23,13 @@
 }
 
 template <typename T>
-bool contains(T& container, const typename T::value_type& value)
+bool contains(const std::set<T> &container, const T &value)
 {
-    return std::find(container.begin(), container.end(), value) != container.end();
+    return container.count(value) > 0;
 }
 
 template <typename T>
-std::unordered_set<T> except(std::vector<T> container1, std::vector<T> container2)
+std::unordered_set<T> except(const std::vector<T> &container1, const std::vector<T> &container2)
 {
     std::unordered_set<T> set(container1.begin(), container1.end());
 
@@ -38,6 +39,16 @@
     return set;
 }
 
+template <typename T>
+std::set<T> except(const std::set<T> &container1, const std::set<T> &container2)
+{
+    std::set<T> result = container1;
+
+    for (auto item : container2)
+        result.erase(item);
+
+    return result;
+}
 
 void set(std::vector<bool> &vector, std::size_t index);
 
diff --git a/Diff/ChangeProcessor.cpp b/Diff/ChangeProcessor.cpp
index ca9613a..bbdbc5b 100644
--- a/Diff/ChangeProcessor.cpp
+++ b/Diff/ChangeProcessor.cpp
@@ -69,7 +69,7 @@
     dumpRevision.SetModelFormatId(change.modelFormatId);
     dumpRevision.Write();
 
-    currentPage->page.RevisionIds.push_back(change.revision.RevisionId);
+    currentPage->page.RevisionIds.insert(change.revision.RevisionId);
 
     visitedRevisionIds.insert(change.revision.RevisionId);
 }
@@ -111,8 +111,7 @@
 
     dumpRevision.Write();
 
-    if (!contains(currentPage->page.RevisionIds, revision.RevisionId))
-        currentPage->page.RevisionIds.push_back(revision.RevisionId);
+    currentPage->page.RevisionIds.insert(revision.RevisionId);
 
     visitedRevisionIds.insert(change.revisionChanges.RevisionId);
 }
diff --git a/Diff/Changes/PageChange.cpp b/Diff/Changes/PageChange.cpp
index 0533e9d..d4ee622 100644
--- a/Diff/Changes/PageChange.cpp
+++ b/Diff/Changes/PageChange.cpp
@@ -24,7 +24,7 @@
 {
     pageChanges = newPage;
     // revision ids are not needed here, so we can save some memory
-    pageChanges.RevisionIds = std::vector<std::uint32_t>();
+    pageChanges.RevisionIds = std::set<std::uint32_t>();
 
     flags = PageChangeFlags::NoChanges;
 
diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
index c4fb343..1d074be 100644
--- a/DumpObjects/DumpPage.cpp
+++ b/DumpObjects/DumpPage.cpp
@@ -106,12 +106,12 @@
 {
     Page page;
 
-    page.PageId = DumpTraits<uint32_t>::Read(stream);
-    page.Namespace = DumpTraits<int16_t>::Read(stream);
-    page.Title = DumpTraits<string>::Read(stream);
-    page.RedirectTarget = DumpTraits<string>::Read(stream);
+    page.PageId = DumpTraits<std::uint32_t>::Read(stream);
+    page.Namespace = DumpTraits<std::int16_t>::Read(stream);
+    page.Title = DumpTraits<std::string>::Read(stream);
+    page.RedirectTarget = DumpTraits<std::string>::Read(stream);
     if (includeRevisionIds)
-        page.RevisionIds = DumpTraits<vector<uint32_t>>::Read(stream);
+        page.RevisionIds = DumpTraits<std::set<std::uint32_t>>::Read(stream);
 
     return page;
 }
diff --git a/DumpObjects/DumpTraits.h b/DumpObjects/DumpTraits.h
index faa5a9f..7c82c01 100644
--- a/DumpObjects/DumpTraits.h
+++ b/DumpObjects/DumpTraits.h
@@ -7,6 +7,7 @@
 #include <iostream>
 #include <array>
 #include <vector>
+#include <set>
 #include <map>
 #include "../DumpException.h"
 
@@ -264,16 +265,16 @@
 };
 
 template<typename T>
-class DumpTraits<vector<T>>
+class DumpTraits<std::vector<T>>
 {
 public:
-    static vector<T> Read(istream &stream)
+    static std::vector<T> Read(std::istream &stream)
     {
-        uint32_t count = DumpTraits<uint32_t>::Read(stream);
+        std::uint32_t count = DumpTraits<std::uint32_t>::Read(stream);
 
-        vector<T> result;
+        std::vector<T> result;
 
-        for (uint32_t i = 0; i < count; i++)
+        for (std::uint32_t i = 0; i < count; i++)
         {
             result.push_back(DumpTraits<T>::Read(stream));
         }
@@ -281,14 +282,14 @@
         return result;
     }
 
-    static void Write(ostream &stream, const vector<T> &value)
+    static void Write(std::ostream &stream, const std::vector<T> &value)
     {
         auto length = value.size();
 
-        if (length >= numeric_limits<uint32_t>::max())
+        if (length >= std::numeric_limits<std::uint32_t>::max())
             throw DumpException();
 
-        DumpTraits<uint32_t>::Write(stream, length);
+        DumpTraits<std::uint32_t>::Write(stream, length);
 
         for (T item : value)
         {
@@ -296,9 +297,9 @@
         }
     }
 
-    static uint32_t DumpSize(const vector<T> &value)
+    static std::uint32_t DumpSize(const std::vector<T> &value)
     {
-        uint32_t size = DumpTraits<uint32_t>::DumpSize(value.size());
+        std::uint32_t size = DumpTraits<std::uint32_t>::DumpSize(value.size());
 
         for (T item : value)
         {
@@ -346,6 +347,52 @@
     }
 };
 
+template<typename T>
+class DumpTraits<std::set<T>>
+{
+public:
+    static std::set<T> Read(std::istream &stream)
+    {
+        std::uint32_t count = DumpTraits<std::uint32_t>::Read(stream);
+
+        std::set<T> result;
+
+        for (std::uint32_t i = 0; i < count; i++)
+        {
+            result.insert(result.end(), DumpTraits<T>::Read(stream));
+        }
+
+        return result;
+    }
+
+    static void Write(std::ostream &stream, const std::set<T> &value)
+    {
+        auto length = value.size();
+
+        if (length >= std::numeric_limits<std::uint32_t>::max())
+            throw DumpException();
+
+        DumpTraits<std::uint32_t>::Write(stream, length);
+
+        for (T item : value)
+        {
+            DumpTraits<T>::Write(stream, item);
+        }
+    }
+
+    static std::uint32_t DumpSize(const std::set<T> &value)
+    {
+        std::uint32_t size = DumpTraits<std::uint32_t>::DumpSize(value.size());
+
+        for (T item : value)
+        {
+            size += DumpTraits<T>::DumpSize(item);
+        }
+
+        return size;
+    }
+};
+
 template<typename TKey, typename TValue>
 class DumpTraits<std::map<TKey, TValue>>
 {
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 2ac4611..dad24e9 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -35,7 +35,7 @@
 
 void DumpWriter::AddRevision(const std::shared_ptr<const Revision> revision)
 {
-    page->page.RevisionIds.push_back(revision->RevisionId);
+    page->page.RevisionIds.insert(revision->RevisionId);
     revisions.push_back(revision);
 }
 
diff --git a/Objects/Page.h b/Objects/Page.h
index bc63c17..de27a05 100644
--- a/Objects/Page.h
+++ b/Objects/Page.h
@@ -2,7 +2,7 @@
 
 #include <cstdint>
 #include <string>
-#include <vector>
+#include <set>
 
 class Page
 {
@@ -13,7 +13,7 @@
     // if empty, the page is not a redirect
     std::string RedirectTarget;
 
-    std::vector<std::uint32_t> RevisionIds;
+    std::set<std::uint32_t> RevisionIds;
 
     Page();
 };
diff --git a/TODO.txt b/TODO.txt
index 96f258a..7e5fa71 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,8 +1,4 @@
 This is a list of smaller things that should (or could) be done.
 
-short term:
-- dump name and timestamp
-- order revisions by id
-
 long term:
 - https://www.mediawiki.org/wiki/User:ArielGlenn/Dumps_new_format_%28deltas,_changesets%29#Open_issues_on_GSOC_2013_project

-- 
To view, visit https://gerrit.wikimedia.org/r/80987
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id5a79bf9b4ee7a7d3fed9c340db34f4673207b0c
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <gsv...@gmail.com>
Gerrit-Reviewer: Petr Onderka <gsv...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to