Petr Onderka has uploaded a new change for review.
https://gerrit.wikimedia.org/r/81662
Change subject: Performed some of Platonides' suggestions
......................................................................
Performed some of Platonides' suggestions
+ fixed a bug with creating new dumps
Change-Id: I05c2c5202b97848bbc5a7a8374713fe9280d4497
---
M Diff/DiffReader.cpp
M Diff/DiffReader.h
M Diff/DiffWriter.cpp
M Dump.cpp
M Dump.h
M DumpObjects/DumpRevision.cpp
M DumpObjects/DumpTraits.h
M DumpObjects/FileHeader.cpp
M DumpObjects/FileHeader.h
M DumpWriters/DumpWriter.cpp
10 files changed, 158 insertions(+), 27 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/62/81662/1
diff --git a/Diff/DiffReader.cpp b/Diff/DiffReader.cpp
index 0658d77..614ad3c 100644
--- a/Diff/DiffReader.cpp
+++ b/Diff/DiffReader.cpp
@@ -3,6 +3,8 @@
#include "../DumpObjects/DumpObject.h"
#include "../DumpObjects/FileHeader.h"
+// Magic number identifying a diff file. DiffWriter writes these bytes
+// immediately before the format/data version bytes, and DiffReader::Read
+// reads the same number of bytes first and throws if they differ.
+const std::string Diff::MagicNumber = "MWDD";
+
DiffReader::DiffReader(std::string fileName, ChangeProcessor &changeProcessor)
: stream(std::unique_ptr<std::istream>(new std::ifstream(fileName,
std::ios::binary))),
changeProcessor(changeProcessor)
@@ -10,14 +12,14 @@
void DiffReader::Read()
{
- std::string magicNumber(4, '\0');
- stream->read(&magicNumber.at(0), 4);
+ std::string magicNumber(Diff::MagicNumber.length(), '\0');
+ stream->read(&magicNumber.at(0), Diff::MagicNumber.length());
std::uint8_t fileFormatVersion, fileDataVersion;
DumpObject::ReadValue(*stream, fileFormatVersion);
DumpObject::ReadValue(*stream, fileDataVersion);
- if (magicNumber != "WMDD"
+ if (magicNumber != Diff::MagicNumber
|| fileFormatVersion != FileHeader::FileFormatVersion
|| fileDataVersion != FileHeader::FileDataVersion)
throw new DumpException();
diff --git a/Diff/DiffReader.h b/Diff/DiffReader.h
index 64d1a22..c0b8d96 100644
--- a/Diff/DiffReader.h
+++ b/Diff/DiffReader.h
@@ -2,6 +2,14 @@
#include "ChangeProcessor.h"
+// Holder for constants shared by the diff reader and the diff writer
+// (DiffWriter.cpp includes this header to reach Diff::MagicNumber).
+// The constructor is private, so code outside the class cannot create
+// instances of it.
+class Diff
+{
+private:
+ Diff(){};
+public:
+ // Magic number of diff files; the definition lives in DiffReader.cpp.
+ static const std::string MagicNumber;
+};
+
class DiffReader
{
private:
diff --git a/Diff/DiffWriter.cpp b/Diff/DiffWriter.cpp
index c3b65cd..b4d6bf7 100644
--- a/Diff/DiffWriter.cpp
+++ b/Diff/DiffWriter.cpp
@@ -1,5 +1,6 @@
#include <fstream>
#include "DiffWriter.h"
+#include "DiffReader.h"
#include "Changes/SiteInfoChange.h"
#include "Changes/NewPageChange.h"
#include "Changes/NewModelFormatChange.h"
@@ -35,7 +36,7 @@
this->dumpKind = dumpKind;
- stream->write("WMDD", 4);
+ stream->write(Diff::MagicNumber.data(), Diff::MagicNumber.length());
DumpObject::WriteValue(*stream, FileHeader::FileFormatVersion);
DumpObject::WriteValue(*stream, FileHeader::FileDataVersion);
DumpObject::WriteValue(*stream, dumpKind);
diff --git a/Dump.cpp b/Dump.cpp
index b63d220..a52be7a 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -55,10 +55,12 @@
stream->clear();
fileHeader = FileHeader(self);
fileHeader.Write();
+ isNew = true;
}
else
{
fileHeader = FileHeader::Read(*this);
+ isNew = false;
}
spaceManager = unique_ptr<SpaceManager>(new SpaceManager(self));
diff --git a/Dump.h b/Dump.h
index 05e0293..c8aef2d 100644
--- a/Dump.h
+++ b/Dump.h
@@ -37,6 +37,8 @@
unique_ptr<DumpSiteInfo> siteInfo;
+ bool isNew;
+
ReadableDump(string fileName);
std::weak_ptr<WritableDump> GetSelf() const;
diff --git a/DumpObjects/DumpRevision.cpp b/DumpObjects/DumpRevision.cpp
index f0474d2..1178153 100644
--- a/DumpObjects/DumpRevision.cpp
+++ b/DumpObjects/DumpRevision.cpp
@@ -154,6 +154,122 @@
this->modelFormatId = modelFormatId;
}
+// Maps a numeric digit to the character that represents it in the given base.
+// Bases 2-36 use '0'-'9' followed by lowercase 'a'-'z'; base 256 returns the
+// digit as a raw byte. Throws DumpException for any other base, or when the
+// digit is not smaller than a base in the 2-36 range.
+char getCharForDigit(std::uint8_t digit, std::uint16_t base)
+{
+    // Binary representation: every 8-bit digit is valid as-is.
+    if (base == 256)
+        return (char)digit;
+
+    const bool isTextBase = base >= 2 && base <= 36;
+    if (!isTextBase || digit >= base)
+        throw DumpException();
+
+    if (digit >= 10)
+        return 'a' + (digit - 10);
+
+    return '0' + digit;
+}
+
+// Inverse of getCharForDigit: maps a character to its numeric digit value in
+// the given base. Bases 2-36 accept '0'-'9' and lowercase 'a'-'z' only;
+// base 256 returns the character's byte value. Throws DumpException for any
+// other base, for a character outside those ranges, or for a digit value
+// that is not smaller than the base.
+std::uint8_t getDigitForChar(char c, std::uint16_t base)
+{
+    // Binary representation: the byte itself is the digit.
+    if (base == 256)
+        return (std::uint8_t)c;
+
+    if (base < 2 || base > 36)
+        throw DumpException();
+
+    std::uint8_t value;
+
+    if ('0' <= c && c <= '9')
+        value = c - '0';
+    else if ('a' <= c && c <= 'z')
+        value = 10 + (c - 'a');
+    else
+        throw DumpException();
+
+    if (value >= base)
+        throw DumpException();
+
+    return value;
+}
+
+// Ported version of MediaWiki's wfBaseConvert.
+//
+// Interprets input as a number written in sourceBase (most significant digit
+// first) and returns the same number written in destBase, left-padded with
+// the destination base's zero digit until it is at least pad characters long.
+// The result may be longer than pad when the number needs more digits.
+//
+// Only the bases accepted by getDigitForChar / getCharForDigit are supported
+// (2-36 and 256); DumpException propagates from those helpers for an
+// unsupported base or for an input character that is not a valid digit.
+std::string baseConvert(const std::string& input, std::uint16_t sourceBase,
+    std::uint16_t destBase, std::uint8_t pad)
+{
+    std::string result;
+    result.reserve(pad);
+
+    std::vector<std::uint8_t> inDigits;
+    inDigits.reserve(input.length());
+
+    std::transform(input.begin(), input.end(), std::back_inserter(inDigits),
+        [sourceBase](char c) { return getDigitForChar(c, sourceBase); });
+
+    // Repeated long division by destBase: every pass divides the number held
+    // in inDigits, keeps the quotient's digits in workDigits and emits the
+    // remainder as the next (least significant first) output digit.
+    // The quotient buffer is hoisted out of the loop so its storage is reused.
+    std::vector<std::uint8_t> workDigits;
+
+    while (!inDigits.empty())
+    {
+        // For the supported bases work never exceeds 255 * 256 + 255,
+        // so 16 bits are enough.
+        std::uint16_t work = 0;
+        workDigits.clear();
+
+        for (std::uint8_t digit : inDigits)
+        {
+            work *= sourceBase;
+            work += digit;
+
+            // Skip leading zero digits of the quotient.
+            if (!workDigits.empty() || work >= destBase)
+                workDigits.push_back(work / destBase);
+
+            work %= destBase;
+        }
+
+        result.push_back(getCharForDigit(work, destBase));
+
+        // The quotient becomes the next dividend; swapping the buffers avoids
+        // copying the whole vector on every pass.
+        inDigits.swap(workDigits);
+    }
+
+    // Digits were produced least significant first, so padding is appended
+    // here and ends up in front once the string is reversed.
+    if (pad > result.length())
+        result.resize(pad, getCharForDigit(0, destBase));
+
+    std::reverse(result.begin(), result.end());
+
+    return result;
+}
+
+// Converts a base-36 string into exactly 20 raw bytes (base 256).
+// The bytes are returned in the reverse of the order baseConvert produces.
+// Throws DumpException when the converted value does not fit in 20 bytes;
+// errors from baseConvert propagate unchanged.
+std::string convertFromBase36(const std::string& input)
+{
+    const std::uint8_t expectedLength = 20;
+
+    std::string bytes = baseConvert(input, 36, 256, expectedLength);
+
+    // baseConvert pads short values, so a mismatch means the value was too big.
+    if (bytes.length() != expectedLength)
+        throw DumpException();
+
+    return std::string(bytes.rbegin(), bytes.rend());
+}
+
+// Converts raw bytes (base 256) into a base-36 string of exactly 31
+// characters.
+// Note: modifies its input - the argument is reversed in place before the
+// conversion and is left reversed when the function returns or throws.
+// Throws DumpException when the result is not 31 characters long; errors
+// from baseConvert propagate unchanged.
+std::string convertToBase36(std::string& input)
+{
+    const std::uint8_t expectedLength = 31;
+
+    std::reverse(input.begin(), input.end());
+
+    std::string encoded = baseConvert(input, 256, 36, expectedLength);
+
+    if (encoded.length() == expectedLength)
+        return encoded;
+
+    throw DumpException();
+}
+
Revision DumpRevision::ReadCore(std::istream &stream, std::uint8_t
&modelFormatId, bool withText, bool loadText)
{
Revision revision;
@@ -174,7 +290,13 @@
if (!HasFlag(revision.Flags, RevisionFlags::TextDeleted))
{
- revision.Sha1 = DumpTraits<std::string>::Read(stream);
+ std::string rawSha1;
+ rawSha1.reserve(20);
+
+ for (int i = 0; i < 20; i++)
+ rawSha1.push_back(DumpTraits<char>::Read(stream));
+
+ revision.Sha1 = convertToBase36(rawSha1);
if (withText)
{
@@ -209,8 +331,9 @@
if (!HasFlag(revision.Flags, RevisionFlags::TextDeleted))
{
- // TODO: convert from base36 for saving
- WriteValue(stream, revision.Sha1);
+ auto convertedSha1 = convertFromBase36(revision.Sha1);
+ for (int i = 0; i < 20; i++)
+ WriteValue(stream, convertedSha1[i]);
if (withText)
DumpTraits<std::string>::WriteLong(stream,
revision.GetCompressedText());
diff --git a/DumpObjects/DumpTraits.h b/DumpObjects/DumpTraits.h
index 7c82c01..ba7f3cf 100644
--- a/DumpObjects/DumpTraits.h
+++ b/DumpObjects/DumpTraits.h
@@ -55,7 +55,7 @@
{
std::array<char, Size> bytes;
- stream.read(&bytes.at(0), Size);
+ stream.read(bytes.data(), Size);
T result = 0;
for (int i = 0; i < Size; i++)
@@ -75,7 +75,7 @@
bytes.at(i) = (value >> (8 * i)) & 0xFF;
}
- stream.write(&bytes.at(0), Size);
+ stream.write(bytes.data(), Size);
}
static uint32_t DumpSize(const T value = 0)
@@ -401,18 +401,13 @@
{
uint16_t count = DumpTraits<uint16_t>::Read(stream);
- std::vector<TKey> keys;
-
- for (int i = 0; i < count; i++)
- {
- keys.push_back(DumpTraits<TKey>::Read(stream));
- }
-
std::map<TKey, TValue> result;
for (int i = 0; i < count; i++)
{
- result.insert(std::pair<TKey, TValue>(keys[i],
DumpTraits<TValue>::Read(stream)));
+ auto key = DumpTraits<TKey>::Read(stream);
+ auto value = DumpTraits<TValue>::Read(stream);
+ result.insert(std::make_pair(key, value));
}
return result;
@@ -430,10 +425,6 @@
for (auto pair : value)
{
DumpTraits<TKey>::Write(stream, pair.first);
- }
-
- for (auto pair : value)
- {
DumpTraits<TValue>::Write(stream, pair.second);
}
}
diff --git a/DumpObjects/FileHeader.cpp b/DumpObjects/FileHeader.cpp
index fe59865..851a714 100644
--- a/DumpObjects/FileHeader.cpp
+++ b/DumpObjects/FileHeader.cpp
@@ -3,6 +3,8 @@
#include "../Dump.h"
#include "../DumpException.h"
+// Magic number identifying a dump file. FileHeader::WriteInternal writes
+// these bytes before the format/data version values, and the header-reading
+// code reads the same number of bytes back and throws if they differ.
+const std::string FileHeader::MagicNumber = "MWID";
+
FileHeader::FileHeader(
DumpKind kind,
Offset fileEnd, Offset pageIdIndexRoot, Offset revisionIdIndexRoot, Offset
modelFormatIndexRoot,
@@ -13,7 +15,7 @@
void FileHeader::WriteInternal()
{
- stream->write("WMID", 4);
+ stream->write(MagicNumber.data(), MagicNumber.length());
WriteValue(FileFormatVersion);
WriteValue(FileDataVersion);
WriteValue(Kind);
@@ -42,14 +44,14 @@
{
istream &stream = *(dump.stream);
- std::string magicNumber(4, '\0');
- stream.read(&magicNumber.at(0), 4);
+ std::string magicNumber(MagicNumber.length(), '\0');
+ stream.read(&magicNumber.at(0), MagicNumber.length());
std::uint8_t fileFormatVersion, fileDataVersion;
ReadValue(stream, fileFormatVersion);
ReadValue(stream, fileDataVersion);
- if (magicNumber != "WMID"
+ if (magicNumber != MagicNumber
|| fileFormatVersion != FileFormatVersion
|| fileDataVersion != FileDataVersion)
throw new DumpException();
diff --git a/DumpObjects/FileHeader.h b/DumpObjects/FileHeader.h
index 712b90a..cffbcab 100644
--- a/DumpObjects/FileHeader.h
+++ b/DumpObjects/FileHeader.h
@@ -19,6 +19,7 @@
protected:
void WriteInternal();
public:
+ static const std::string MagicNumber;
static const std::uint8_t FileFormatVersion = 1;
static const std::uint8_t FileDataVersion = 1;
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index dad24e9..c6a4aa5 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -86,8 +86,7 @@
if (withText)
dumpKind |= DumpKind::Pages;
- // empty name means it's a new dump
- if (dump->siteInfo->name.empty())
+ if (dump->isNew)
{
dump->fileHeader.Kind = dumpKind;
dump->fileHeader.Write();
--
To view, visit https://gerrit.wikimedia.org/r/81662
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I05c2c5202b97848bbc5a7a8374713fe9280d4497
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits