Petr Onderka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/81662


Change subject: Performed some of Platonides' suggestions
......................................................................

Performed some of Platonides' suggestions

+ fixed a bug with creating new dumps

Change-Id: I05c2c5202b97848bbc5a7a8374713fe9280d4497
---
M Diff/DiffReader.cpp
M Diff/DiffReader.h
M Diff/DiffWriter.cpp
M Dump.cpp
M Dump.h
M DumpObjects/DumpRevision.cpp
M DumpObjects/DumpTraits.h
M DumpObjects/FileHeader.cpp
M DumpObjects/FileHeader.h
M DumpWriters/DumpWriter.cpp
10 files changed, 158 insertions(+), 27 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/62/81662/1

diff --git a/Diff/DiffReader.cpp b/Diff/DiffReader.cpp
index 0658d77..614ad3c 100644
--- a/Diff/DiffReader.cpp
+++ b/Diff/DiffReader.cpp
@@ -3,6 +3,8 @@
 #include "../DumpObjects/DumpObject.h"
 #include "../DumpObjects/FileHeader.h"
 
+const std::string Diff::MagicNumber = "MWDD";
+
 DiffReader::DiffReader(std::string fileName, ChangeProcessor &changeProcessor)
     : stream(std::unique_ptr<std::istream>(new std::ifstream(fileName, std::ios::binary))),
         changeProcessor(changeProcessor)
@@ -10,14 +12,14 @@
 
 void DiffReader::Read()
 {
-    std::string magicNumber(4, '\0');
-    stream->read(&magicNumber.at(0), 4);
+    std::string magicNumber(Diff::MagicNumber.length(), '\0');
+    stream->read(&magicNumber.at(0), Diff::MagicNumber.length());
 
     std::uint8_t fileFormatVersion, fileDataVersion;
     DumpObject::ReadValue(*stream, fileFormatVersion);
     DumpObject::ReadValue(*stream, fileDataVersion);
 
-    if (magicNumber != "WMDD"
+    if (magicNumber != Diff::MagicNumber
         || fileFormatVersion != FileHeader::FileFormatVersion
         || fileDataVersion != FileHeader::FileDataVersion)
         throw new DumpException();
diff --git a/Diff/DiffReader.h b/Diff/DiffReader.h
index 64d1a22..c0b8d96 100644
--- a/Diff/DiffReader.h
+++ b/Diff/DiffReader.h
@@ -2,6 +2,14 @@
 
 #include "ChangeProcessor.h"
 
+class Diff
+{
+private:
+    Diff(){};
+public:
+    static const std::string MagicNumber;
+};
+
 class DiffReader
 {
 private:
diff --git a/Diff/DiffWriter.cpp b/Diff/DiffWriter.cpp
index c3b65cd..b4d6bf7 100644
--- a/Diff/DiffWriter.cpp
+++ b/Diff/DiffWriter.cpp
@@ -1,5 +1,6 @@
 #include <fstream>
 #include "DiffWriter.h"
+#include "DiffReader.h"
 #include "Changes/SiteInfoChange.h"
 #include "Changes/NewPageChange.h"
 #include "Changes/NewModelFormatChange.h"
@@ -35,7 +36,7 @@
 
     this->dumpKind = dumpKind;
 
-    stream->write("WMDD", 4);
+    stream->write(Diff::MagicNumber.data(), Diff::MagicNumber.length());
     DumpObject::WriteValue(*stream, FileHeader::FileFormatVersion);
     DumpObject::WriteValue(*stream, FileHeader::FileDataVersion);
     DumpObject::WriteValue(*stream, dumpKind);
diff --git a/Dump.cpp b/Dump.cpp
index b63d220..a52be7a 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -55,10 +55,12 @@
         stream->clear();
         fileHeader = FileHeader(self);
         fileHeader.Write();
+        isNew = true;
     }
     else
     {
         fileHeader = FileHeader::Read(*this);
+        isNew = false;
     }
 
     spaceManager = unique_ptr<SpaceManager>(new SpaceManager(self));
diff --git a/Dump.h b/Dump.h
index 05e0293..c8aef2d 100644
--- a/Dump.h
+++ b/Dump.h
@@ -37,6 +37,8 @@
 
     unique_ptr<DumpSiteInfo> siteInfo;
 
+    bool isNew;
+
     ReadableDump(string fileName);
 
     std::weak_ptr<WritableDump> GetSelf() const;
diff --git a/DumpObjects/DumpRevision.cpp b/DumpObjects/DumpRevision.cpp
index f0474d2..1178153 100644
--- a/DumpObjects/DumpRevision.cpp
+++ b/DumpObjects/DumpRevision.cpp
@@ -154,6 +154,122 @@
     this->modelFormatId = modelFormatId;
 }
 
+char getCharForDigit(std::uint8_t digit, std::uint16_t base)
+{
+    if (base >= 2 && base <= 36)
+    {
+        if (digit >= base)
+            throw DumpException();
+
+        if (digit < 10)
+            return '0' + digit;
+
+        return 'a' + (digit - 10);
+    }
+
+    if (base == 256)
+    {
+        return (char)digit;
+    }
+
+    throw DumpException();
+}
+
+std::uint8_t getDigitForChar(char c, std::uint16_t base)
+{
+    if (base >= 2 && base <= 36)
+    {
+        std::uint8_t result;
+
+        if (c >= '0' && c <= '9')
+            result = c - '0';
+        else if (c >= 'a' && c <= 'z')
+            result = 10 + (c - 'a');
+        else
+            throw DumpException();
+
+        if (result >= base)
+            throw DumpException();
+
+        return result;
+    }
+
+    if (base == 256)
+    {
+        return (std::uint8_t)c;
+    }
+
+    throw DumpException();
+}
+
+// ported version of wfBaseConvert
+std::string baseConvert(const std::string& input, std::uint16_t sourceBase, std::uint16_t destBase, std::uint8_t pad)
+{
+    std::string result;
+    result.reserve(pad);
+
+    std::vector<std::uint8_t> inDigits;
+
+    std::transform(input.begin(), input.end(), std::back_inserter(inDigits), [=](char c) { return getDigitForChar(c, sourceBase); });
+
+    while (!inDigits.empty())
+    {
+        std::uint16_t work = 0;
+        std::vector<std::uint8_t> workDigits;
+
+        for (std::uint8_t digit : inDigits)
+        {
+            work *= sourceBase;
+            work += digit;
+
+            if (!workDigits.empty() || work >= destBase)
+                workDigits.push_back(work / destBase);
+
+            work %= destBase;
+        }
+
+        result.push_back(getCharForDigit(work, destBase));
+
+        inDigits = workDigits;
+    }
+
+    if (pad > result.length())
+        result.resize(pad, getCharForDigit(0, destBase));
+
+    std::reverse(result.begin(), result.end());
+
+    return result;
+}
+
+std::string convertFromBase36(const std::string& input)
+{
+    const std::uint8_t expectedLength = 20;
+
+    auto result = baseConvert(input, 36, 256, expectedLength);
+
+    if (result.length() != expectedLength)
+        throw DumpException();
+
+    std::reverse(result.begin(), result.end());
+
+    return result;
+}
+
+// note: modifies its input
+std::string convertToBase36(std::string& input)
+{
+    std::reverse(input.begin(), input.end());
+
+    const std::uint8_t expectedLength = 31;
+
+    auto result =  baseConvert(input, 256, 36, expectedLength);
+
+    if (result.length() != expectedLength)
+        throw DumpException();
+
+    return result;
+}
+
 Revision DumpRevision::ReadCore(std::istream &stream, std::uint8_t &modelFormatId, bool withText, bool loadText)
 {
     Revision revision;
@@ -174,7 +290,13 @@
 
     if (!HasFlag(revision.Flags, RevisionFlags::TextDeleted))
     {
-        revision.Sha1 = DumpTraits<std::string>::Read(stream);
+        std::string rawSha1;
+        rawSha1.reserve(20);
+
+        for (int i = 0; i < 20; i++)
+            rawSha1.push_back(DumpTraits<char>::Read(stream));
+
+        revision.Sha1 = convertToBase36(rawSha1);
 
         if (withText)
         {
@@ -209,8 +331,9 @@
     
     if (!HasFlag(revision.Flags, RevisionFlags::TextDeleted))
     {
-        // TODO: convert from base36 for saving
-        WriteValue(stream, revision.Sha1);
+        auto convertedSha1 = convertFromBase36(revision.Sha1);
+        for (int i = 0; i < 20; i++)
+            WriteValue(stream, convertedSha1[i]);
 
         if (withText)
             DumpTraits<std::string>::WriteLong(stream, revision.GetCompressedText());
diff --git a/DumpObjects/DumpTraits.h b/DumpObjects/DumpTraits.h
index 7c82c01..ba7f3cf 100644
--- a/DumpObjects/DumpTraits.h
+++ b/DumpObjects/DumpTraits.h
@@ -55,7 +55,7 @@
     {
         std::array<char, Size> bytes;
 
-        stream.read(&bytes.at(0), Size);
+        stream.read(bytes.data(), Size);
 
         T result = 0;
         for (int i = 0; i < Size; i++)
@@ -75,7 +75,7 @@
             bytes.at(i) = (value >> (8 * i)) & 0xFF;
         }
 
-        stream.write(&bytes.at(0), Size);
+        stream.write(bytes.data(), Size);
     }
 
     static uint32_t DumpSize(const T value = 0)
@@ -401,18 +401,13 @@
     {
         uint16_t count = DumpTraits<uint16_t>::Read(stream);
 
-        std::vector<TKey> keys;
-
-        for (int i = 0; i < count; i++)
-        {
-            keys.push_back(DumpTraits<TKey>::Read(stream));
-        }
-
         std::map<TKey, TValue> result;
 
         for (int i = 0; i < count; i++)
         {
-            result.insert(std::pair<TKey, TValue>(keys[i], DumpTraits<TValue>::Read(stream)));
+            auto key = DumpTraits<TKey>::Read(stream);
+            auto value = DumpTraits<TValue>::Read(stream);
+            result.insert(std::make_pair(key, value));
         }
 
         return result;
@@ -430,10 +425,6 @@
         for (auto pair : value)
         {
             DumpTraits<TKey>::Write(stream, pair.first);
-        }
-
-        for (auto pair : value)
-        {
             DumpTraits<TValue>::Write(stream, pair.second);
         }
     }
diff --git a/DumpObjects/FileHeader.cpp b/DumpObjects/FileHeader.cpp
index fe59865..851a714 100644
--- a/DumpObjects/FileHeader.cpp
+++ b/DumpObjects/FileHeader.cpp
@@ -3,6 +3,8 @@
 #include "../Dump.h"
 #include "../DumpException.h"
 
+const std::string FileHeader::MagicNumber = "MWID";
+
 FileHeader::FileHeader(
     DumpKind kind,
     Offset fileEnd, Offset pageIdIndexRoot, Offset revisionIdIndexRoot, Offset modelFormatIndexRoot,
@@ -13,7 +15,7 @@
 
 void FileHeader::WriteInternal()
 {
-    stream->write("WMID", 4);
+    stream->write(MagicNumber.data(), MagicNumber.length());
     WriteValue(FileFormatVersion);
     WriteValue(FileDataVersion);
     WriteValue(Kind);
@@ -42,14 +44,14 @@
 {
     istream &stream = *(dump.stream);
 
-    std::string magicNumber(4, '\0');
-    stream.read(&magicNumber.at(0), 4);
+    std::string magicNumber(MagicNumber.length(), '\0');
+    stream.read(&magicNumber.at(0), MagicNumber.length());
 
     std::uint8_t fileFormatVersion, fileDataVersion;
     ReadValue(stream, fileFormatVersion);
     ReadValue(stream, fileDataVersion);
 
-    if (magicNumber != "WMID"
+    if (magicNumber != MagicNumber
         || fileFormatVersion != FileFormatVersion
         || fileDataVersion != FileDataVersion)
         throw new DumpException();
diff --git a/DumpObjects/FileHeader.h b/DumpObjects/FileHeader.h
index 712b90a..cffbcab 100644
--- a/DumpObjects/FileHeader.h
+++ b/DumpObjects/FileHeader.h
@@ -19,6 +19,7 @@
 protected:
     void WriteInternal();
 public:
+    static const std::string MagicNumber;
     static const std::uint8_t FileFormatVersion = 1;
     static const std::uint8_t FileDataVersion = 1;
 
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index dad24e9..c6a4aa5 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -86,8 +86,7 @@
     if (withText)
         dumpKind |= DumpKind::Pages;
 
-    // empty name means it's a new dump
-    if (dump->siteInfo->name.empty())
+    if (dump->isNew)
     {
         dump->fileHeader.Kind = dumpKind;
         dump->fileHeader.Write();

-- 
To view, visit https://gerrit.wikimedia.org/r/81662
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I05c2c5202b97848bbc5a7a8374713fe9280d4497
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to