Petr Onderka has uploaded a new change for review.
https://gerrit.wikimedia.org/r/72993
Change subject: saving page metadata
......................................................................
saving page metadata
Change-Id: Iff5afe7457f6ad74b1faadeefe34fd10f0fbada7
---
M .gitignore
M Dump.cpp
M Dump.h
M DumpObjects/DumpObject.cpp
M DumpObjects/DumpObject.h
A DumpObjects/DumpObjectKind.h
A DumpObjects/DumpPage.cpp
A DumpObjects/DumpPage.h
M DumpObjects/DumpTraits.h
M DumpObjects/FileHeader.cpp
M DumpObjects/FileHeader.h
M DumpObjects/Offset.cpp
M DumpObjects/Offset.h
D DumpObjects/Page.h
D DumpWriter.h
A DumpWriters/DumpWriter.h
A DumpWriters/StubCurrentWriter.cpp
A DumpWriters/StubCurrentWriter.h
R DumpWriters/TestDumpWriter.cpp
A DumpWriters/TestDumpWriter.h
M Incremental dumps.vcxproj
M Indexes/Index.h
M Indexes/Index.tpp
M Indexes/IndexLeafNode.h
M Indexes/IndexLeafNode.tpp
M Indexes/IndexNode.h
M Indexes/IndexNode.tpp
A Objects/Page.h
R Objects/Revision.h
M SpaceManager.cpp
M SpaceManager.h
D TestDumpWriter.h
M XmlPageProcessor.cpp
M XmlPageProcessor.h
M XmlRevisionProcessor.h
M main.cpp
36 files changed, 507 insertions(+), 166 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/93/72993/1
diff --git a/.gitignore b/.gitignore
index 9ea4b86..aa34183 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
*.vcxproj.filters
*.vcxproj.user
/Debug/
+/Release/
diff --git a/Dump.cpp b/Dump.cpp
index a0ad987..ea66e56 100644
--- a/Dump.cpp
+++ b/Dump.cpp
@@ -59,8 +59,8 @@
spaceManager = unique_ptr<SpaceManager>(new SpaceManager(self));
- pageIdIndex = unique_ptr<Index<int32_t, Offset>>(
- new Index<int32_t, Offset>(self, shared_ptr<Offset>(self.lock(),
&fileHeader.PageIdIndexRoot)));
+ pageIdIndex = unique_ptr<Index<uint32_t, Offset>>(
+ new Index<uint32_t, Offset>(self, shared_ptr<Offset>(self.lock(),
&fileHeader.PageIdIndexRoot)));
}
shared_ptr<WritableDump> WritableDump::Create(string fileName)
diff --git a/Dump.h b/Dump.h
index ddf0660..a082132 100644
--- a/Dump.h
+++ b/Dump.h
@@ -26,7 +26,7 @@
public:
// TODO: others should not be able to steal this stream
unique_ptr<iostream> stream;
- unique_ptr<Index<int32_t, Offset>> pageIdIndex;
+ unique_ptr<Index<uint32_t, Offset>> pageIdIndex;
ReadableDump(string fileName);
diff --git a/DumpObjects/DumpObject.cpp b/DumpObjects/DumpObject.cpp
index 5356958..32b88f2 100644
--- a/DumpObjects/DumpObject.cpp
+++ b/DumpObjects/DumpObject.cpp
@@ -9,8 +9,8 @@
{
auto dumpRef = dump.lock();
- int32_t newLength = NewLength();
- int64_t newOffset;
+ uint32_t newLength = NewLength();
+ uint64_t newOffset;
if (newLength == savedLength)
newOffset = savedOffset;
@@ -24,16 +24,23 @@
newOffset = spaceManager->GetSpace(newLength);
}
- ostream& stream = *(dumpRef->stream);
- stream.seekp(newOffset);
+ stream = dumpRef->stream.get();
+ stream->seekp(newOffset);
- Write(stream);
+ WriteInternal();
+
+ stream = nullptr;
savedOffset = newOffset;
savedLength = newLength;
+
+ UpdateIndex(newOffset);
}
-int64_t DumpObject::SavedOffset()
+void DumpObject::UpdateIndex(Offset offset)
+{}
+
+uint64_t DumpObject::SavedOffset() const
{
return savedOffset;
}
\ No newline at end of file
diff --git a/DumpObjects/DumpObject.h b/DumpObjects/DumpObject.h
index d12faf1..7a3e166 100644
--- a/DumpObjects/DumpObject.h
+++ b/DumpObjects/DumpObject.h
@@ -3,26 +3,47 @@
#include <cstdint>
#include <memory>
#include <iostream>
+#include "Offset.h"
class WritableDump;
-using std::int64_t;
+using std::uint64_t;
using std::unique_ptr;
using std::weak_ptr;
using std::ostream;
class DumpObject
{
+private:
+ ostream *stream;
protected:
weak_ptr<WritableDump> dump;
- int64_t savedOffset;
- int32_t savedLength;
+ uint64_t savedOffset;
+ uint32_t savedLength;
DumpObject(weak_ptr<WritableDump> dump);
- virtual void Write(ostream &stream) = 0;
+ virtual void WriteInternal() = 0;
+ virtual void UpdateIndex(Offset offset);
+ template<typename T>
+ void WriteValue(const T value);
+
+ template<typename T>
+ uint32_t ValueSize(const T value) const;
public:
virtual void Write();
- virtual int32_t NewLength() = 0;
- int64_t SavedOffset();
-};
\ No newline at end of file
+ virtual uint32_t NewLength() const = 0;
+ uint64_t SavedOffset() const;
+};
+
+template<typename T>
+void DumpObject::WriteValue(const T value)
+{
+ DumpTraits<T>::Write(*stream, value);
+}
+
+template<typename T>
+uint32_t DumpObject::ValueSize(const T value) const
+{
+ return DumpTraits<T>::DumpSize(value);
+}
\ No newline at end of file
diff --git a/DumpObjects/DumpObjectKind.h b/DumpObjects/DumpObjectKind.h
new file mode 100644
index 0000000..a0c2b15
--- /dev/null
+++ b/DumpObjects/DumpObjectKind.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstdint>
+
+using std::uint8_t;
+
+enum class DumpObjectKind : uint8_t
+{
+ IndexLeafNode = 0x01,
+ IndexInnerNode = 0x02,
+
+ Page = 0x11,
+ Revision = 0x12
+};
\ No newline at end of file
diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
new file mode 100644
index 0000000..3736887
--- /dev/null
+++ b/DumpObjects/DumpPage.cpp
@@ -0,0 +1,66 @@
+#include "DumpPage.h"
+#include "DumpObjectKind.h"
+
+void DumpPage::Load(uint32_t pageId)
+{
+ auto dumpRef = dump.lock();
+ auto pageOffset = dumpRef->pageIdIndex->Get(pageId);
+ if (pageOffset.value == 0)
+ {
+ page = Page();
+ savedOffset = 0;
+ savedLength = 0;
+ }
+ else
+ {
+ page = Read(dumpRef, pageOffset);
+ savedOffset = pageOffset.value;
+ savedLength = NewLength();
+ }
+}
+
+Page DumpPage::Read(shared_ptr<WritableDump> dump, Offset offset)
+{
+ Page page;
+
+ auto &stream = *(dump->stream);
+ stream.seekp(offset.value);
+
+ auto kind = DumpTraits<uint8_t>::Read(stream);
+ if (kind != (uint8_t)DumpObjectKind::Page)
+ throw new DumpException();
+
+ page.PageId = DumpTraits<uint32_t>::Read(stream);
+ page.Namespace = DumpTraits<uint16_t>::Read(stream);
+ page.Title = DumpTraits<string>::Read(stream);
+ page.RedirectTarget = DumpTraits<string>::Read(stream);
+
+ return page;
+}
+
+void DumpPage::WriteInternal()
+{
+ WriteValue((uint8_t)DumpObjectKind::Page);
+ WriteValue(page.PageId);
+ WriteValue(page.Namespace);
+ WriteValue(page.Title);
+ WriteValue(page.RedirectTarget);
+}
+
+void DumpPage::UpdateIndex(Offset offset)
+{
+ auto dumpRef = dump.lock();
+ dumpRef->pageIdIndex->AddOrUpdate(page.PageId, offset);
+}
+
+uint32_t DumpPage::NewLength() const
+{
+ return ValueSize((uint8_t)DumpObjectKind::Page) + ValueSize(page.PageId)
+ + ValueSize(page.Namespace) + ValueSize(page.Title) +
ValueSize(page.RedirectTarget);
+}
+
+DumpPage::DumpPage(weak_ptr<WritableDump> dump, uint32_t pageId)
+ : DumpObject(dump), page()
+{
+ Load(pageId);
+}
\ No newline at end of file
diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h
new file mode 100644
index 0000000..d2d0b58
--- /dev/null
+++ b/DumpObjects/DumpPage.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "DumpObject.h"
+#include "../Dump.h"
+#include "../Objects/Page.h"
+
+using std::shared_ptr;
+
+class DumpPage : public DumpObject
+{
+private:
+ void Load(uint32_t pageId);
+ static Page Read(shared_ptr<WritableDump> dump, Offset offset);
+protected:
+ virtual void WriteInternal();
+ virtual void UpdateIndex(Offset offset);
+public:
+ Page page;
+
+ DumpPage(weak_ptr<WritableDump> dump, uint32_t pageId);
+
+ virtual uint32_t NewLength() const;
+};
\ No newline at end of file
diff --git a/DumpObjects/DumpTraits.h b/DumpObjects/DumpTraits.h
index 0464268..8044f76 100644
--- a/DumpObjects/DumpTraits.h
+++ b/DumpObjects/DumpTraits.h
@@ -1,9 +1,14 @@
#pragma once
#include <cstdint>
+#include <memory>
+#include <string>
#include <iostream>
+#include "../DumpException.h"
-using std::int32_t;
+using std::uint32_t;
+using std::unique_ptr;
+using std::string;
using std::istream;
using std::ostream;
@@ -21,32 +26,37 @@
value.Write(stream);
}
- static int32_t DumpSize()
+ static uint32_t DumpSize()
{
return T::DumpSize();
+ }
+
+ static uint32_t DumpSize(const T value)
+ {
+ return DumpSize();
}
};
template<>
-class DumpTraits<int32_t>
+class DumpTraits<uint32_t>
{
public:
- static int32_t Read(istream &stream)
+ static uint32_t Read(istream &stream)
{
char bytes[4];
stream.read(bytes, 4);
- int32_t result = 0;
- result |= (int32_t)(uint8_t)bytes[0];
- result |= (int32_t)(uint8_t)bytes[1] << 8;
- result |= (int32_t)(uint8_t)bytes[2] << 16;
- result |= (int32_t)(uint8_t)bytes[3] << 24;
+ uint32_t result = 0;
+ result |= (uint32_t)(uint8_t)bytes[0];
+ result |= (uint32_t)(uint8_t)bytes[1] << 8;
+ result |= (uint32_t)(uint8_t)bytes[2] << 16;
+ result |= (uint32_t)(uint8_t)bytes[3] << 24;
return result;
}
- static void Write(ostream &stream, const int32_t value)
+ static void Write(ostream &stream, const uint32_t value)
{
char bytes[4];
@@ -58,30 +68,96 @@
stream.write(bytes, 4);
}
- static int32_t DumpSize()
+ static uint32_t DumpSize(const uint32_t value = 0)
{
return 4;
}
};
template<>
-class DumpTraits<char>
+class DumpTraits<uint16_t>
{
public:
- static char Read(istream &stream)
+ static uint16_t Read(istream &stream)
+ {
+ char bytes[2];
+
+ stream.read(bytes, 2);
+
+ uint16_t result = 0;
+ result |= (uint16_t)(uint8_t)bytes[0];
+ result |= (uint16_t)(uint8_t)bytes[1] << 8;
+
+ return result;
+ }
+
+ static void Write(ostream &stream, const uint16_t value)
+ {
+ char bytes[2];
+
+ bytes[0] = value & 0xFF;
+ bytes[1] = (value >> 8) & 0xFF;
+
+ stream.write(bytes, 2);
+ }
+
+ static uint32_t DumpSize(const uint16_t value = 0)
+ {
+ return 2;
+ }
+};
+
+template<>
+class DumpTraits<uint8_t>
+{
+public:
+ static uint8_t Read(istream &stream)
{
char byte;
stream.read(&byte, 1);
return byte;
}
- static void Write(ostream &stream, const char value)
+ static void Write(ostream &stream, const uint8_t value)
{
- stream.write(&value, 1);
+ stream.put(value);
}
- static int32_t DumpSize()
+ static uint32_t DumpSize(const uint8_t value = 0)
{
return 1;
}
+};
+
+// for now, handle only strings of length up to 255
+template<>
+class DumpTraits<string>
+{
+public:
+ static string Read(istream &stream)
+ {
+ uint8_t count = DumpTraits<uint8_t>::Read(stream);
+
+ auto bytes = unique_ptr<char[]>(new char[count]);
+ stream.read(bytes.get(), count);
+
+ return string(bytes.get(), count);
+ }
+
+ static void Write(ostream &stream, const string value)
+ {
+ auto length = value.length();
+
+ if (length > 255)
+ throw DumpException();
+
+ DumpTraits<uint8_t>::Write(stream, length);
+
+ stream.write(value.data(), length);
+ }
+
+ static uint32_t DumpSize(const string value)
+ {
+ return DumpTraits<uint8_t>::DumpSize(value.length()) + value.length();
+ }
};
\ No newline at end of file
diff --git a/DumpObjects/FileHeader.cpp b/DumpObjects/FileHeader.cpp
index 19533f5..0f7c98c 100644
--- a/DumpObjects/FileHeader.cpp
+++ b/DumpObjects/FileHeader.cpp
@@ -6,24 +6,27 @@
: DumpObject(dump), FileEnd(fileEnd), PageIdIndexRoot(pageIdIndexRoot),
FreeSpaceIndexRoot(freeSpaceIndexRoot)
{}
-void FileHeader::Write(ostream &stream)
+void FileHeader::WriteInternal()
{
- stream.write("WMID", 4);
- stream.write(&FileFormatVersion, 1);
- stream.write(&FileDataVersion, 1);
+ stream->write("WMID", 4);
+ DumpTraits<uint8_t>::Write(*stream, FileFormatVersion);
+ DumpTraits<uint8_t>::Write(*stream, FileDataVersion);
- FileEnd.Write(stream);
- PageIdIndexRoot.Write(stream);
- FreeSpaceIndexRoot.Write(stream);
+ FileEnd.Write(*stream);
+ PageIdIndexRoot.Write(*stream);
+ FreeSpaceIndexRoot.Write(*stream);
}
void FileHeader::Write()
{
auto dumpRef = dump.lock();
- ostream &stream = *(dumpRef->stream);
+ stream = dumpRef->stream.get();
- stream.seekp(0);
- Write(stream);
+ stream->seekp(0);
+
+ WriteInternal();
+
+ stream = nullptr;
}
FileHeader FileHeader::Read(ReadableDump const &dump)
@@ -42,9 +45,9 @@
return FileHeader(fileEnd, pageIdIndexRoot, freeSpaceIndexRoot,
dump.GetSelf());
}
-int32_t FileHeader::NewLength()
+uint32_t FileHeader::NewLength() const
{
- return 6 + 3 * 6;
+ return 4 + 2 * DumpTraits<uint8_t>::DumpSize() + 3 *
DumpTraits<Offset>::DumpSize();
}
FileHeader::FileHeader(weak_ptr<WritableDump> dump)
diff --git a/DumpObjects/FileHeader.h b/DumpObjects/FileHeader.h
index 4ea97e8..7d6cf20 100644
--- a/DumpObjects/FileHeader.h
+++ b/DumpObjects/FileHeader.h
@@ -11,17 +11,19 @@
class FileHeader : public DumpObject
{
private:
+ ostream* stream;
+
FileHeader(Offset fileEnd, Offset pageIdIndexRoot, Offset
freeSpaceIndexRoot, weak_ptr<WritableDump> dump = weak_ptr<WritableDump>());
protected:
- virtual void Write(ostream &stream);
+ void WriteInternal();
public:
- static const char FileFormatVersion = 1;
- static const char FileDataVersion = 1;
+ static const uint8_t FileFormatVersion = 1;
+ static const uint8_t FileDataVersion = 1;
static FileHeader Read(ReadableDump const &dump);
virtual void Write();
- virtual int32_t NewLength();
+ virtual uint32_t NewLength() const;
Offset FileEnd;
Offset PageIdIndexRoot;
diff --git a/DumpObjects/Offset.cpp b/DumpObjects/Offset.cpp
index 1a2e2de..7cdb6c7 100644
--- a/DumpObjects/Offset.cpp
+++ b/DumpObjects/Offset.cpp
@@ -1,7 +1,7 @@
#include "Offset.h"
#include "../DumpException.h"
-Offset::Offset(int64_t value)
+Offset::Offset(uint64_t value)
: value(value)
{
if (value < 0 || value > 0xFFFFFFFFFFFF) // 6 bytes
@@ -28,18 +28,18 @@
stream.read(bytes, 6);
- int64_t offset = 0;
- offset |= (int64_t)(uint8_t)bytes[0];
- offset |= (int64_t)(uint8_t)bytes[1] << 8;
- offset |= (int64_t)(uint8_t)bytes[2] << 16;
- offset |= (int64_t)(uint8_t)bytes[3] << 24;
- offset |= (int64_t)(uint8_t)bytes[4] << 32;
- offset |= (int64_t)(uint8_t)bytes[5] << 40;
+ uint64_t offset = 0;
+ offset |= (uint64_t)(uint8_t)bytes[0];
+ offset |= (uint64_t)(uint8_t)bytes[1] << 8;
+ offset |= (uint64_t)(uint8_t)bytes[2] << 16;
+ offset |= (uint64_t)(uint8_t)bytes[3] << 24;
+ offset |= (uint64_t)(uint8_t)bytes[4] << 32;
+ offset |= (uint64_t)(uint8_t)bytes[5] << 40;
return Offset(offset);
}
-int32_t Offset::DumpSize()
+uint32_t Offset::DumpSize()
{
return 6;
}
diff --git a/DumpObjects/Offset.h b/DumpObjects/Offset.h
index c7cce0e..87f8f3f 100644
--- a/DumpObjects/Offset.h
+++ b/DumpObjects/Offset.h
@@ -3,19 +3,19 @@
#include <cstdint>
#include <iostream>
-using std::int64_t;
+using std::uint64_t;
using std::istream;
using std::ostream;
class Offset
{
public:
- int64_t value;
+ uint64_t value;
- Offset(int64_t value);
+ Offset(uint64_t value = 0);
void Write(ostream &stream) const;
static Offset Read(istream &stream);
- static int32_t DumpSize();
+ static uint32_t DumpSize();
};
bool operator <(const Offset &first, const Offset &second);
\ No newline at end of file
diff --git a/DumpObjects/Page.h b/DumpObjects/Page.h
deleted file mode 100644
index e4d416c..0000000
--- a/DumpObjects/Page.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-
-#include <string>
-
-using std::string;
-
-class Page
-{
-public:
- string Title;
-};
\ No newline at end of file
diff --git a/DumpWriter.h b/DumpWriter.h
deleted file mode 100644
index 5b6c10c..0000000
--- a/DumpWriter.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include <memory>
-
-#include "DumpObjects/Page.h"
-#include "DumpObjects/Revision.h"
-
-using std::shared_ptr;
-
-class DumpWriter
-{
-public:
- virtual void WritePage(const shared_ptr<const Page> page) = 0;
- virtual void WriteRevision(const shared_ptr<const Revision> revision) = 0;
-};
\ No newline at end of file
diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h
new file mode 100644
index 0000000..56f6dac
--- /dev/null
+++ b/DumpWriters/DumpWriter.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <memory>
+
+#include "../Objects/Page.h"
+#include "../Objects/Revision.h"
+
+using std::shared_ptr;
+
+class DumpWriter
+{
+public:
+ virtual void StartPage(const shared_ptr<const Page> page) = 0;
+ virtual void AddRevision(const shared_ptr<const Revision> revision) = 0;
+ virtual void EndPage() = 0;
+};
\ No newline at end of file
diff --git a/DumpWriters/StubCurrentWriter.cpp
b/DumpWriters/StubCurrentWriter.cpp
new file mode 100644
index 0000000..6c3e3a5
--- /dev/null
+++ b/DumpWriters/StubCurrentWriter.cpp
@@ -0,0 +1,21 @@
+#include "StubCurrentWriter.h"
+
+StubCurrentWriter::StubCurrentWriter(shared_ptr<WritableDump> dump)
+ : dump(dump)
+{}
+
+void StubCurrentWriter::StartPage(const shared_ptr<const Page> page)
+{
+ this->page = unique_ptr<DumpPage>(new DumpPage(dump, page->PageId));
+ this->page->page = *page;
+}
+
+void StubCurrentWriter::AddRevision(const shared_ptr<const Revision> revision)
+{
+}
+
+void StubCurrentWriter::EndPage()
+{
+ page->Write();
+ page = nullptr;
+}
\ No newline at end of file
diff --git a/DumpWriters/StubCurrentWriter.h b/DumpWriters/StubCurrentWriter.h
new file mode 100644
index 0000000..6ad1f9e
--- /dev/null
+++ b/DumpWriters/StubCurrentWriter.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "DumpWriter.h"
+#include "../DumpObjects/DumpPage.h"
+
+class StubCurrentWriter : public DumpWriter
+{
+private:
+ shared_ptr<WritableDump> dump;
+ unique_ptr<DumpPage> page;
+public:
+ StubCurrentWriter(shared_ptr<WritableDump> dump);
+
+ virtual void StartPage(const shared_ptr<const Page> page);
+ virtual void AddRevision(const shared_ptr<const Revision> revision);
+ virtual void EndPage();
+};
\ No newline at end of file
diff --git a/TestDumpWriter.cpp b/DumpWriters/TestDumpWriter.cpp
similarity index 76%
rename from TestDumpWriter.cpp
rename to DumpWriters/TestDumpWriter.cpp
index 1b6e9b2..6b34eb2 100644
--- a/TestDumpWriter.cpp
+++ b/DumpWriters/TestDumpWriter.cpp
@@ -16,12 +16,16 @@
return subject;
}
-void TestDumpWriter::WritePage(const shared_ptr<const Page> page)
+void TestDumpWriter::StartPage(const shared_ptr<const Page> page)
{
cout << page->Title << "\n";
}
-void TestDumpWriter::WriteRevision(const shared_ptr<const Revision> revision)
+void TestDumpWriter::AddRevision(const shared_ptr<const Revision> revision)
{
cout << " " << ReplaceString(revision->Text, "\n", "\\n").substr(0, 78) <<
"\n";
+}
+
+void TestDumpWriter::EndPage()
+{
}
\ No newline at end of file
diff --git a/DumpWriters/TestDumpWriter.h b/DumpWriters/TestDumpWriter.h
new file mode 100644
index 0000000..638adb9
--- /dev/null
+++ b/DumpWriters/TestDumpWriter.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <string>
+#include "DumpWriter.h"
+
+using std::string;
+
+class TestDumpWriter : public DumpWriter
+{
+private:
+ string ReplaceString(string subject, const string& search, const string&
replace);
+public:
+ virtual void StartPage(const shared_ptr<const Page> page);
+ virtual void AddRevision(const shared_ptr<const Revision> revision);
+ virtual void EndPage();
+};
\ No newline at end of file
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index 4b6a00f..c3e58f6 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -67,7 +67,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
-
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
@@ -84,6 +84,8 @@
<ClCompile Include="DumpObjects\DumpTraits.cpp" />
<ClCompile Include="DumpObjects\FileHeader.cpp" />
<ClInclude Include="DumpException.h" />
+ <ClInclude Include="DumpObjects\DumpObjectKind.h" />
+ <ClInclude Include="DumpObjects\DumpPage.h" />
<ClInclude Include="Indexes\Index.h" />
<ClInclude Include="Indexes\IndexLeafNode.tpp">
<FileType>CppCode</FileType>
@@ -93,10 +95,12 @@
<FileType>CppCode</FileType>
</ClInclude>
<ClInclude Include="Indexes\Index.tpp" />
+ <ClCompile Include="DumpObjects\DumpPage.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="DumpObjects\Offset.cpp" />
<ClCompile Include="SpaceManager.cpp" />
- <ClCompile Include="TestDumpWriter.cpp" />
+ <ClCompile Include="DumpWriters\TestDumpWriter.cpp" />
+ <ClCompile Include="DumpWriters\StubCurrentWriter.cpp" />
<ClCompile Include="XmlPageProcessor.cpp" />
<ClCompile Include="XmlRevisionProcessor.cpp" />
<ClCompile Include="XmlUtils.cpp" />
@@ -110,13 +114,13 @@
<ClInclude Include="Dump.h" />
<ClInclude Include="DumpObjects\DumpObject.h" />
<ClInclude Include="DumpObjects\DumpTraits.h" />
- <ClInclude Include="DumpWriter.h" />
+ <ClInclude Include="DumpWriters\DumpWriter.h" />
<ClInclude Include="DumpObjects\FileHeader.h" />
<ClInclude Include="Indexes\IndexLeafNode.h" />
<ClInclude Include="Indexes\IndexNode.h" />
<ClInclude Include="DumpObjects\Offset.h" />
- <ClInclude Include="DumpObjects\Page.h" />
- <ClInclude Include="DumpObjects\Revision.h" />
+ <ClInclude Include="Objects\Page.h" />
+ <ClInclude Include="Objects\Revision.h" />
<ClInclude Include="Indexes\Iterators\IndexIterator.h" />
<ClInclude Include="Indexes\Iterators\IndexIterator.tpp" />
<ClInclude Include="Indexes\Iterators\IndexLeafIterator.h" />
@@ -124,7 +128,8 @@
<ClInclude Include="Indexes\Iterators\IndexNodeIterator.h" />
<ClInclude Include="Indexes\Iterators\IndexNodeIterator.tpp" />
<ClInclude Include="SpaceManager.h" />
- <ClInclude Include="TestDumpWriter.h" />
+ <ClInclude Include="DumpWriters\TestDumpWriter.h" />
+ <ClInclude Include="DumpWriters\StubCurrentWriter.h" />
<ClInclude Include="XmlPageProcessor.h" />
<ClInclude Include="XmlRevisionProcessor.h" />
<ClInclude Include="XmlUtils.h" />
diff --git a/Indexes/Index.h b/Indexes/Index.h
index 37417fc..85a86ac 100644
--- a/Indexes/Index.h
+++ b/Indexes/Index.h
@@ -16,13 +16,12 @@
unique_ptr<IndexNode<TKey, TValue>> rootNode;
weak_ptr<WritableDump> dump;
weak_ptr<Offset> fileHeaderOffset;
-
- void Save();
public:
- Index(weak_ptr<WritableDump> dump, weak_ptr<Offset> fileHeaderOffset);
+ Index(weak_ptr<WritableDump> dump, weak_ptr<Offset> fileHeaderOffset, bool
delaySave = false);
- TValue operator[](TKey key);
+ TValue Get(TKey key);
void Add(TKey key, TValue value);
+ void AddOrUpdate(TKey key, TValue value);
void Remove(TKey key);
IndexIterator<TKey, TValue> begin() const;
diff --git a/Indexes/Index.tpp b/Indexes/Index.tpp
index 54281be..f99c952 100644
--- a/Indexes/Index.tpp
+++ b/Indexes/Index.tpp
@@ -5,24 +5,36 @@
using std::move;
template<typename TKey, typename TValue>
-Index<TKey, TValue>::Index(weak_ptr<WritableDump> dump, weak_ptr<Offset>
fileHeaderOffset)
+Index<TKey, TValue>::Index(weak_ptr<WritableDump> dump, weak_ptr<Offset>
fileHeaderOffset, bool delaySave)
: dump(dump), fileHeaderOffset(fileHeaderOffset)
{
auto offset = fileHeaderOffset.lock();
+ fileHeaderZero = false;
+
if (offset->value == 0)
{
rootNode = IndexNode<TKey, TValue>::CreateNew(dump);
- fileHeaderZero = true;
+
+ if (delaySave)
+ {
+ fileHeaderZero = true;
+ }
+ else
+ {
+ rootNode->Write();
+ fileHeaderOffset.lock()->value = rootNode->SavedOffset();
+ dump.lock()->fileHeader.Write();
+ }
}
else
rootNode = IndexNode<TKey, TValue>::Read(dump, offset->value);
}
template<typename TKey, typename TValue>
-TValue Index<TKey, TValue>::operator[](TKey key)
+TValue Index<TKey, TValue>::Get(TKey key)
{
- return (*rootNode)[key];
+ return rootNode->Get(key);
}
template<typename TKey, typename TValue>
@@ -40,6 +52,20 @@
}
template<typename TKey, typename TValue>
+void Index<TKey, TValue>::AddOrUpdate(TKey key, TValue value)
+{
+ rootNode->AddOrUpdate(key, value);
+
+ if (fileHeaderZero)
+ {
+ fileHeaderOffset.lock()->value = rootNode->SavedOffset();
+ dump.lock()->fileHeader.Write();
+
+ fileHeaderZero = false;
+ }
+}
+
+template<typename TKey, typename TValue>
void Index<TKey, TValue>::Remove(TKey key)
{
rootNode->Remove(key);
diff --git a/Indexes/IndexLeafNode.h b/Indexes/IndexLeafNode.h
index 0e94d78..f93960a 100644
--- a/Indexes/IndexLeafNode.h
+++ b/Indexes/IndexLeafNode.h
@@ -9,21 +9,22 @@
class IndexLeafNode : public IndexNode<TKey, TValue>
{
private:
- static const int Size = 10; // has to be at most 128 for now
+ static const int Size = 255;
map<TKey, TValue> map;
protected:
- virtual void Write(ostream &stream);
+ virtual void WriteInternal();
public:
static unique_ptr<IndexNode> Read(weak_ptr<WritableDump> dump, istream
&stream);
IndexLeafNode(weak_ptr<WritableDump> dump);
using DumpObject::Write;
- virtual int32_t NewLength();
+ virtual uint32_t NewLength() const;
- virtual TValue operator[](TKey key);
+ virtual TValue Get(TKey key);
virtual void Add(TKey key, TValue value);
+ virtual void AddOrUpdate(TKey key, TValue value);
virtual void Remove(TKey key);
virtual shared_ptr<IndexNodeIterator<TKey, TValue>> begin() const;
diff --git a/Indexes/IndexLeafNode.tpp b/Indexes/IndexLeafNode.tpp
index 422a3b1..edd36f6 100644
--- a/Indexes/IndexLeafNode.tpp
+++ b/Indexes/IndexLeafNode.tpp
@@ -7,9 +7,12 @@
using std::vector;
template<typename TKey, typename TValue>
-TValue IndexLeafNode<TKey, TValue>::operator[](TKey key)
+TValue IndexLeafNode<TKey, TValue>::Get(TKey key)
{
- return map.find(key)->second;
+ auto found = map.find(key);
+ if (found == map.end())
+ return TValue();
+ return found->second;
}
template<typename TKey, typename TValue>
@@ -26,12 +29,25 @@
}
template<typename TKey, typename TValue>
+void IndexLeafNode<TKey, TValue>::AddOrUpdate(TKey key, TValue value)
+{
+ auto pos = map.find(key);
+ if (pos == map.end())
+ {
+ Add(key, value);
+ }
+ else
+ {
+ pos->second = value; // will this work?
+ }
+}
+
+template<typename TKey, typename TValue>
void IndexLeafNode<TKey, TValue>::Remove(const TKey key)
{
map.erase(key);
Write();
}
-
template<typename TKey, typename TValue>
IndexLeafNode<TKey, TValue>::IndexLeafNode(weak_ptr<WritableDump> dump)
@@ -44,7 +60,7 @@
{
auto node = new IndexLeafNode<TKey, TValue>(dump);
- char count = DumpTraits<char>::Read(stream);
+ uint8_t count = DumpTraits<uint8_t>::Read(stream);
vector<TKey> keys;
@@ -62,26 +78,26 @@
}
template<typename TKey, typename TValue>
-void IndexLeafNode<TKey, TValue>::Write(ostream &stream)
+void IndexLeafNode<TKey, TValue>::WriteInternal()
{
- DumpTraits<char>::Write(stream, (char)NodeKind::LeafNode);
- DumpTraits<char>::Write(stream, map.size());
+ WriteValue((uint8_t)DumpObjectKind::IndexLeafNode);
+ WriteValue((uint8_t)map.size());
for (auto pair : map)
{
- DumpTraits<TKey>::Write(stream, pair.first);
+ WriteValue(pair.first);
}
for (auto pair : map)
{
- DumpTraits<TValue>::Write(stream, pair.second);
+ WriteValue(pair.second);
}
}
template<typename TKey, typename TValue>
-int32_t IndexLeafNode<TKey, TValue>::NewLength()
+uint32_t IndexLeafNode<TKey, TValue>::NewLength() const
{
- return 2 * DumpTraits<char>::DumpSize()
+ return 2 * DumpTraits<uint8_t>::DumpSize()
+ Size * (DumpTraits<TKey>::DumpSize() +
DumpTraits<TValue>::DumpSize());
}
diff --git a/Indexes/IndexNode.h b/Indexes/IndexNode.h
index 1908934..6265558 100644
--- a/Indexes/IndexNode.h
+++ b/Indexes/IndexNode.h
@@ -11,23 +11,19 @@
class IndexNode : public DumpObject
{
protected:
- virtual void Write(ostream &stream) = 0;
-
- enum class NodeKind : char
- {
- LeafNode = 1
- };
+ virtual void WriteInternal() = 0;
public:
- static unique_ptr<IndexNode> Read(weak_ptr<WritableDump> dump, int64_t
offset);
+ static unique_ptr<IndexNode> Read(weak_ptr<WritableDump> dump, uint64_t
offset);
static unique_ptr<IndexNode> CreateNew(weak_ptr<WritableDump> dump);
IndexNode(weak_ptr<WritableDump> dump);
using DumpObject::Write;
- virtual TValue operator[](TKey key) = 0;
+ virtual TValue Get(TKey key) = 0;
virtual void Add(TKey key, TValue value) = 0;
+ virtual void AddOrUpdate(TKey key, TValue value) = 0;
virtual void Remove(TKey key) = 0;
virtual shared_ptr<IndexNodeIterator<TKey, TValue>> begin() const = 0;
diff --git a/Indexes/IndexNode.tpp b/Indexes/IndexNode.tpp
index 9b562fa..0c13830 100644
--- a/Indexes/IndexNode.tpp
+++ b/Indexes/IndexNode.tpp
@@ -1,6 +1,7 @@
#include "IndexNode.h"
#include "IndexLeafNode.h"
#include "../DumpException.h"
+#include "../DumpObjects/DumpObjectKind.h"
template<typename TKey, typename TValue>
IndexNode<TKey, TValue>::IndexNode(weak_ptr<WritableDump> dump)
@@ -8,7 +9,7 @@
{}
template<typename TKey, typename TValue>
-unique_ptr<IndexNode<TKey, TValue>> IndexNode<TKey,
TValue>::Read(weak_ptr<WritableDump> dump, int64_t offset)
+unique_ptr<IndexNode<TKey, TValue>> IndexNode<TKey,
TValue>::Read(weak_ptr<WritableDump> dump, uint64_t offset)
{
auto dumpRef = dump.lock();
auto &stream = *(dumpRef->stream);
@@ -16,7 +17,7 @@
char byte;
stream.read(&byte, 1);
- if (byte == (char)NodeKind::LeafNode)
+ if (byte == (char)DumpObjectKind::IndexLeafNode)
{
auto result = IndexLeafNode<TKey, TValue>::Read(dump, stream);
result->savedOffset = offset;
diff --git a/Objects/Page.h b/Objects/Page.h
new file mode 100644
index 0000000..8c53e4d
--- /dev/null
+++ b/Objects/Page.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <string>
+
+using std::string;
+
+class Page
+{
+public:
+ uint32_t PageId;
+ uint16_t Namespace;
+ string Title;
+ // if empty, the page is not a redirect
+ string RedirectTarget;
+};
\ No newline at end of file
diff --git a/DumpObjects/Revision.h b/Objects/Revision.h
similarity index 100%
rename from DumpObjects/Revision.h
rename to Objects/Revision.h
diff --git a/SpaceManager.cpp b/SpaceManager.cpp
index 334b4a9..8eca6bc 100644
--- a/SpaceManager.cpp
+++ b/SpaceManager.cpp
@@ -5,7 +5,7 @@
SpaceManager::SpaceManager(weak_ptr<WritableDump> dump)
: dump(dump),
- spaceIndex(dump, shared_ptr<Offset>(dump.lock(),
&dump.lock()->fileHeader.FreeSpaceIndexRoot)),
+ spaceIndex(dump, shared_ptr<Offset>(dump.lock(),
&dump.lock()->fileHeader.FreeSpaceIndexRoot), true),
spaceByLength()
{
for (auto value : spaceIndex)
@@ -14,7 +14,7 @@
}
}
-int64_t SpaceManager::GetSpace(int32_t length)
+uint64_t SpaceManager::GetSpace(uint32_t length)
{
auto foundSpace = spaceByLength.lower_bound(length);
if (foundSpace != spaceByLength.end())
@@ -46,7 +46,7 @@
}
}
-void SpaceManager::Delete(int64_t offset, int32_t length)
+void SpaceManager::Delete(uint64_t offset, uint32_t length)
{
// TODO: free space at the end just decrements fileEnd
// TODO: join consecutive free blocks
diff --git a/SpaceManager.h b/SpaceManager.h
index daf168f..40d396a 100644
--- a/SpaceManager.h
+++ b/SpaceManager.h
@@ -5,8 +5,8 @@
#include <map>
#include "Indexes/Index.h"
-using std::int32_t;
-using std::int64_t;
+using std::uint32_t;
+using std::uint64_t;
using std::weak_ptr;
using std::multimap;
@@ -16,10 +16,10 @@
{
private:
weak_ptr<WritableDump> dump;
- Index<Offset, int32_t> spaceIndex;
- multimap<int32_t, Offset> spaceByLength;
+ Index<Offset, uint32_t> spaceIndex;
+ multimap<uint32_t, Offset> spaceByLength;
public:
SpaceManager(weak_ptr<WritableDump> dump);
- int64_t GetSpace(int32_t length);
- void Delete(int64_t offset, int32_t length);
+ uint64_t GetSpace(uint32_t length);
+ void Delete(uint64_t offset, uint32_t length);
};
\ No newline at end of file
diff --git a/TestDumpWriter.h b/TestDumpWriter.h
deleted file mode 100644
index 717dab8..0000000
--- a/TestDumpWriter.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include <string>
-#include "DumpWriter.h"
-
-using std::string;
-
-class TestDumpWriter : public DumpWriter
-{
-private:
- string ReplaceString(string subject, const string& search, const string&
replace);
-public:
- virtual void WritePage(const shared_ptr<Page const> page);
- virtual void WriteRevision(const shared_ptr<const Revision> revision);
-};
\ No newline at end of file
diff --git a/XmlPageProcessor.cpp b/XmlPageProcessor.cpp
index d8ae903..3afc920 100644
--- a/XmlPageProcessor.cpp
+++ b/XmlPageProcessor.cpp
@@ -1,9 +1,10 @@
#include "XmlPageProcessor.h"
#include "XmlRevisionProcessor.h"
#include "XmlUtils.h"
-#include "DumpWriter.h"
+#include "DumpWriters/DumpWriter.h"
using std::make_shared;
+using std::stoi;
void XmlPageProcessor::titleHandler(XML::Element &elem, void *userData)
{
@@ -11,13 +12,38 @@
processor->page->Title = readElementData(elem);
}
+void XmlPageProcessor::nsHandler(XML::Element &elem, void *userData)
+{
+ XmlPageProcessor* processor = (XmlPageProcessor*)userData;
+ processor->page->Namespace = stoi(readElementData(elem));
+}
+
+void XmlPageProcessor::idHandler(XML::Element &elem, void *userData)
+{
+ XmlPageProcessor* processor = (XmlPageProcessor*)userData;
+ processor->page->PageId = stoi(readElementData(elem));
+}
+
+void XmlPageProcessor::redirectHandler(XML::Element &elem, void *userData)
+{
+ XmlPageProcessor* processor = (XmlPageProcessor*)userData;
+
+ //processor->page->RedirectTarget = readElementData(elem);
+ processor->page->RedirectTarget = string(elem.GetAttribute("title"));
+}
+
void XmlPageProcessor::writePage()
{
if (!pageWritten)
{
- dumpWriter->WritePage(page);
+ dumpWriter->StartPage(page);
pageWritten = true;
}
+}
+
+void XmlPageProcessor::completePage()
+{
+ dumpWriter->EndPage();
}
XmlPageProcessor::XmlPageProcessor(const shared_ptr<Page> page, DumpWriter*
dumpWriter)
@@ -29,11 +55,14 @@
{
static int i = 0;
- if (i++ > 5)
+ if (i++ >= 255)
return;
XML::Handler handlers[] = {
XML::Handler("title", titleHandler),
+ XML::Handler("ns", nsHandler),
+ XML::Handler("id", idHandler),
+ XML::Handler("redirect", redirectHandler),
XML::Handler("revision", XmlRevisionProcessor::Handler),
XML::Handler::END
};
@@ -45,10 +74,11 @@
elem.Process(handlers, &pageProcessor);
pageProcessor.writePage();
+ pageProcessor.completePage();
}
void XmlPageProcessor::ProcessRevision(const shared_ptr<const Revision>
revision)
{
writePage();
- dumpWriter->WriteRevision(revision);
+ dumpWriter->AddRevision(revision);
}
\ No newline at end of file
diff --git a/XmlPageProcessor.h b/XmlPageProcessor.h
index 55deab3..e060267 100644
--- a/XmlPageProcessor.h
+++ b/XmlPageProcessor.h
@@ -2,8 +2,8 @@
#include <memory>
#include "XML/xmlinput.h"
-#include "DumpObjects/Page.h"
-#include "DumpWriter.h"
+#include "Objects/Page.h"
+#include "DumpWriters/DumpWriter.h"
using std::shared_ptr;
@@ -17,7 +17,11 @@
XmlPageProcessor(const shared_ptr<Page> page, DumpWriter* dumpWriter);
static void titleHandler(XML::Element &elem, void *userData);
+ static void nsHandler(XML::Element &elem, void *userData);
+ static void idHandler(XML::Element &elem, void *userData);
+ static void redirectHandler(XML::Element &elem, void *userData);
void writePage();
+ void completePage();
public:
static void Handler(XML::Element &elem, void *userData);
void ProcessRevision(const shared_ptr<const Revision> revision);
diff --git a/XmlRevisionProcessor.h b/XmlRevisionProcessor.h
index a5238db..28a6a37 100644
--- a/XmlRevisionProcessor.h
+++ b/XmlRevisionProcessor.h
@@ -1,7 +1,7 @@
#pragma once
#include "XML/xmlinput.h"
-#include "DumpObjects/Revision.h"
+#include "Objects/Revision.h"
class XmlRevisionProcessor
{
diff --git a/main.cpp b/main.cpp
index ec78254..06e7e01 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,7 +1,7 @@
#include <iostream>
#include "XML/xmlinput.h"
#include "XML/xmlfile.h"
-#include "TestDumpWriter.h"
+#include "DumpWriters/StubCurrentWriter.h"
#include "XmlPageProcessor.h"
#include "Dump.h"
@@ -30,7 +30,7 @@
int main(int argc, const char* argv[])
{
//StandardInputStream stream;
- /*XML::FileInputStream stream =
XML::FileInputStream("C:\\Users\\Svick\\Downloads\\tenwiki-20130622-pages-meta-history.xml");
+ XML::FileInputStream stream =
XML::FileInputStream("C:\\Users\\Svick\\Downloads\\tenwiki-20130622-pages-meta-history.xml");
XML::Input input(stream);
@@ -39,14 +39,16 @@
XML::Handler::END
};
- TestDumpWriter writer;
-
- input.Process(handlers, &writer);*/
-
shared_ptr<WritableDump> dump = WritableDump::Create("tmp/test.id");
+
+ StubCurrentWriter writer(dump);
+
+ input.Process(handlers, &writer);
+
+ /*shared_ptr<WritableDump> dump = WritableDump::Create("tmp/test.id");
auto offset = dump->spaceManager->GetSpace(102);
dump->spaceManager->Delete(offset, 102);
- dump->pageIdIndex->Add(1, 2);
+ dump->pageIdIndex->Add(1, 2);*/
}
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/72993
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iff5afe7457f6ad74b1faadeefe34fd10f0fbada7
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits