Petr Onderka has uploaded a new change for review.
https://gerrit.wikimedia.org/r/71995
Change subject: starting with dump format: file header
......................................................................
starting with dump format: file header
Change-Id: I29350dbe9de280efa1248a6c87d174fa6494d32e
---
A Dump.cpp
A Dump.h
A DumpObject.cpp
A DumpObject.h
A FileHeader.cpp
A FileHeader.h
M Incremental dumps.vcxproj
A Offset.cpp
A Offset.h
M main.cpp
10 files changed, 246 insertions(+), 2 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental
refs/changes/95/71995/1
diff --git a/Dump.cpp b/Dump.cpp
new file mode 100644
index 0000000..5a0d77f
--- /dev/null
+++ b/Dump.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <fstream>
+#include "Dump.h"
+
+using std::unique_ptr;
+using std::move;
+using std::string;
+using std::fstream;
+using std::ios;
+
+// Wraps an already-opened stream; the dump takes over ownership of it.
+ReadableDump::ReadableDump(unique_ptr<iostream> stream)
+    : stream(std::move(stream))
+{
+}
+
+// Opens fileName for binary reading. No check is made here that the file
+// could actually be opened.
+ReadableDump::ReadableDump(string fileName)
+    : stream(new fstream(fileName, ios::in | ios::binary))
+{
+}
+
+// Opens fileName for binary reading and writing and returns the stream.
+//
+// The file is first opened without ios::trunc so that existing contents are
+// kept. If that open fails, a second attempt is made with ios::trunc, which
+// creates the file when it does not exist yet. The returned stream is set up
+// to throw on failbit and badbit.
+unique_ptr<iostream> WritableDump::openStream(string fileName)
+{
+    const ios::openmode mode = ios::in | ios::out | ios::binary;
+
+    // Owned by a unique_ptr from the start, so the stream of the failed first
+    // attempt is destroyed instead of leaked when it is replaced below.
+    unique_ptr<fstream> stream(new fstream(fileName, mode));
+
+    if (!stream->is_open())
+    {
+        // this feels dangerous, isn't there a better way?
+        // NOTE(review): every kind of open failure ends up here, not only a
+        // missing file.
+        stream.reset(new fstream(fileName, mode | ios::trunc));
+    }
+
+    stream->exceptions(ios::failbit | ios::badbit);
+
+    return unique_ptr<iostream>(move(stream));
+}
+
+// Opens the dump file through openStream and brings fileHeader in line with
+// it: an empty file gets a fresh header written at offset 0, otherwise the
+// header is read from the stream.
+WritableDump::WritableDump(string fileName)
+    : ReadableDump(openStream(fileName))
+{
+    const bool fileIsEmpty = (stream->peek() == EOF);
+
+    if (!fileIsEmpty)
+    {
+        fileHeader = FileHeader::Read(stream);
+        return;
+    }
+
+    // Reset the stream state left behind by the peek() at end of file before
+    // writing anything.
+    stream->clear();
+    fileHeader = FileHeader();
+    fileHeader.Write(stream, 0);
+}
\ No newline at end of file
diff --git a/Dump.h b/Dump.h
new file mode 100644
index 0000000..e995d1b
--- /dev/null
+++ b/Dump.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <iostream>
+#include "FileHeader.h"
+
+using std::int64_t;
+using std::unique_ptr;
+using std::string;
+using std::iostream;
+
+// Exception thrown when dump data is invalid (for example a bad file header
+// or an offset that is out of range).
+//
+// Derives publicly from std::exception: with the private inheritance a class
+// gets by default, handlers written as catch (std::exception &) would not
+// match a DumpException.
+class DumpException : public std::exception
+{
+};
+
+// A dump opened for reading; owns the stream the dump is accessed through.
+class ReadableDump
+{
+public:
+    // Opens the file fileName for binary reading.
+    ReadableDump(string fileName);
+
+protected:
+    // Takes ownership of an already-opened stream.
+    ReadableDump(unique_ptr<iostream> stream);
+
+    unique_ptr<iostream> stream;
+};
+
+// A dump opened for reading and writing; keeps the file header in memory.
+class WritableDump : public ReadableDump
+{
+public:
+    // Opens fileName; an empty file gets a new header, otherwise the existing
+    // header is read.
+    WritableDump(string fileName);
+
+private:
+    // Opens fileName for binary reading and writing and returns the stream.
+    static unique_ptr<iostream> openStream(string fileName);
+
+    FileHeader fileHeader;
+};
\ No newline at end of file
diff --git a/DumpObject.cpp b/DumpObject.cpp
new file mode 100644
index 0000000..ce6715f
--- /dev/null
+++ b/DumpObject.cpp
@@ -0,0 +1,7 @@
+#include "DumpObject.h"
+
+// Moves the stream's put position to offset and writes this object there.
+void DumpObject::Write(unique_ptr<iostream> const &stream, int64_t offset)
+{
+    iostream &target = *stream;
+    target.seekp(offset);
+
+    WriteInternal(stream);
+}
\ No newline at end of file
diff --git a/DumpObject.h b/DumpObject.h
new file mode 100644
index 0000000..f7b5c2f
--- /dev/null
+++ b/DumpObject.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <iostream>
+
+using std::int64_t;
+using std::unique_ptr;
+using std::iostream;
+
+// Base class for objects that can be written into a dump at a given offset.
+class DumpObject
+{
+protected:
+    // Writes the object at the stream's current put position.
+    virtual void WriteInternal(unique_ptr<iostream> const &stream) = 0;
+public:
+    // Virtual so that destroying a derived object through a DumpObject pointer
+    // is well defined.
+    virtual ~DumpObject() {}
+
+    // Seeks the stream's put position to offset, then calls WriteInternal.
+    void Write(unique_ptr<iostream> const &stream, int64_t offset);
+};
\ No newline at end of file
diff --git a/FileHeader.cpp b/FileHeader.cpp
new file mode 100644
index 0000000..ddee2dc
--- /dev/null
+++ b/FileHeader.cpp
@@ -0,0 +1,37 @@
+#include "FileHeader.h"
+#include "Dump.h"
+
+// Builds a header from three offsets that are already known.
+FileHeader::FileHeader(Offset fileEnd,
+                       Offset pageIdIndexRoot,
+                       Offset freeSpaceIndexRoot)
+    : FileEnd(fileEnd),
+      PageIdIndexRoot(pageIdIndexRoot),
+      FreeSpaceIndexRoot(freeSpaceIndexRoot)
+{}
+
+// Writes the header at the stream's current put position: the 4-byte magic
+// "WMID", one byte each for the format version and the data version, then the
+// three offsets FileEnd, PageIdIndexRoot and FreeSpaceIndexRoot.
+void FileHeader::WriteInternal(unique_ptr<iostream> const &stream)
+{
+    // The version bytes are copied into a local buffer rather than written
+    // through &FileFormatVersion / &FileDataVersion: taking the address of
+    // those static constants requires an out-of-class definition, and this
+    // file does not provide one.
+    const char magicAndVersions[6] =
+        { 'W', 'M', 'I', 'D', FileFormatVersion, FileDataVersion };
+    stream->write(magicAndVersions, sizeof(magicAndVersions));
+
+    FileEnd.Write(stream);
+    PageIdIndexRoot.Write(stream);
+    FreeSpaceIndexRoot.Write(stream);
+}
+
+// Reads a header from the stream's current get position.
+//
+// Throws DumpException if the stream reports failure after reading the first
+// 6 bytes, or if the magic "WMID" or either version byte does not match.
+FileHeader FileHeader::Read(unique_ptr<iostream> const &stream)
+{
+    char bytes[6];
+    stream->read(bytes, 6);
+
+    const bool valid = !stream->fail()
+        && strncmp(bytes, "WMID", 4) == 0
+        && bytes[4] == FileFormatVersion
+        && bytes[5] == FileDataVersion;
+
+    // Thrown by value, as the Offset constructor does: "throw new" would throw
+    // a pointer that catch (DumpException &) does not match and nobody frees.
+    if (!valid)
+        throw DumpException();
+
+    Offset fileEnd = Offset::Read(stream);
+    Offset pageIdIndexRoot = Offset::Read(stream);
+    Offset freeSpaceIndexRoot = Offset::Read(stream);
+
+    return FileHeader(fileEnd, pageIdIndexRoot, freeSpaceIndexRoot);
+}
+
+// Creates a header whose three offsets are all 0.
+FileHeader::FileHeader()
+    : FileEnd(0),
+      PageIdIndexRoot(0),
+      FreeSpaceIndexRoot(0)
+{}
\ No newline at end of file
diff --git a/FileHeader.h b/FileHeader.h
new file mode 100644
index 0000000..20e1bb1
--- /dev/null
+++ b/FileHeader.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <iostream>
+#include "DumpObject.h"
+#include "Offset.h"
+
+using std::istream;
+
+// The header of a dump file: magic, two version bytes and three offsets.
+class FileHeader : public DumpObject
+{
+public:
+    static const char FileFormatVersion = 1;
+    static const char FileDataVersion = 1;
+
+    Offset FileEnd;
+    Offset PageIdIndexRoot;
+    Offset FreeSpaceIndexRoot;
+
+    // Creates a header with all three offsets set to 0.
+    FileHeader();
+
+    // Reads a header from the stream and checks its magic and versions.
+    static FileHeader Read(unique_ptr<iostream> const &stream);
+
+protected:
+    // Writes the magic, the versions and the three offsets to the stream.
+    virtual void WriteInternal(unique_ptr<iostream> const &stream);
+
+private:
+    FileHeader(Offset fileEnd, Offset pageIdIndexRoot, Offset freeSpaceIndexRoot);
+};
\ No newline at end of file
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index c22e91a..63131ae 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -78,7 +78,11 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
+ <ClCompile Include="Dump.cpp" />
+ <ClCompile Include="DumpObject.cpp" />
+ <ClCompile Include="FileHeader.cpp" />
<ClCompile Include="main.cpp" />
+ <ClCompile Include="Offset.cpp" />
<ClCompile Include="TestDumpWriter.cpp" />
<ClCompile Include="XmlPageProcessor.cpp" />
<ClCompile Include="XmlRevisionProcessor.cpp" />
@@ -90,7 +94,11 @@
<ClCompile Include="XML\xmloutput.cpp" />
</ItemGroup>
<ItemGroup>
+ <ClInclude Include="Dump.h" />
+ <ClInclude Include="DumpObject.h" />
<ClInclude Include="DumpWriter.h" />
+ <ClInclude Include="FileHeader.h" />
+ <ClInclude Include="Offset.h" />
<ClInclude Include="Page.h" />
<ClInclude Include="Revision.h" />
<ClInclude Include="TestDumpWriter.h" />
diff --git a/Offset.cpp b/Offset.cpp
new file mode 100644
index 0000000..d6097e0
--- /dev/null
+++ b/Offset.cpp
@@ -0,0 +1,43 @@
+#include "Offset.h"
+#include "Dump.h"
+
+// Wraps a file offset. Only values that fit into 6 bytes are accepted; any
+// other value throws DumpException.
+Offset::Offset(int64_t value)
+    : value(value)
+{
+    const bool fitsInSixBytes = value >= 0 && value <= 0xFFFFFFFFFFFF;
+
+    if (!fitsInSixBytes)
+        throw DumpException();
+}
+
+// Writes the offset to the stream as 6 bytes, least significant byte first.
+void Offset::Write(unique_ptr<iostream> const &stream) const
+{
+    const int byteCount = 6;
+    char bytes[byteCount];
+
+    for (int i = 0; i < byteCount; ++i)
+        bytes[i] = (value >> (8 * i)) & 0xFF;
+
+    stream->write(bytes, byteCount);
+}
+
+// Reads an offset stored as 6 bytes, least significant byte first, from the
+// stream's current get position.
+//
+// Throws DumpException if the stream reports failure after the read.
+Offset Offset::Read(unique_ptr<iostream> const &stream)
+{
+    const int byteCount = 6;
+    char bytes[byteCount];
+
+    stream->read(bytes, byteCount);
+
+    // Thrown by value, consistent with the constructor: "throw new" would
+    // throw a pointer that catch (DumpException &) does not match.
+    if (stream->fail())
+        throw DumpException();
+
+    int64_t offset = 0;
+    for (int i = 0; i < byteCount; ++i)
+    {
+        // Go through unsigned char first: where char is signed, a byte >= 0x80
+        // would otherwise be sign-extended and set all the higher bits.
+        const unsigned char byte = static_cast<unsigned char>(bytes[i]);
+        offset |= static_cast<int64_t>(byte) << (8 * i);
+    }
+
+    return Offset(offset);
+}
\ No newline at end of file
diff --git a/Offset.h b/Offset.h
new file mode 100644
index 0000000..f0f451a
--- /dev/null
+++ b/Offset.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <iostream>
+
+using std::int64_t;
+using std::unique_ptr;
+using std::iostream;
+
+// A position in the dump file, written to and read from streams as 6 bytes.
+class Offset
+{
+public:
+    // Throws DumpException unless 0 <= value <= 0xFFFFFFFFFFFF.
+    Offset(int64_t value);
+
+    // Reads an offset from the stream.
+    static Offset Read(unique_ptr<iostream> const &stream);
+
+    // Writes this offset to the stream.
+    void Write(unique_ptr<iostream> const &stream) const;
+
+    int64_t value;
+};
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 0bb615a..229b7b2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -4,6 +4,7 @@
#include "XML/xmlfile.h"
#include "TestDumpWriter.h"
#include "XmlPageProcessor.h"
+#include "Dump.h"
using std::string;
using std::cin;
@@ -31,7 +32,7 @@
int main(int argc, const char* argv[])
{
//StandardInputStream stream;
- XML::FileInputStream stream =
XML::FileInputStream("C:\\Users\\Svick\\Downloads\\tenwiki-20130622-pages-meta-history.xml");
+ /*XML::FileInputStream stream =
XML::FileInputStream("C:\\Users\\Svick\\Downloads\\tenwiki-20130622-pages-meta-history.xml");
XML::Input input(stream);
@@ -42,5 +43,7 @@
TestDumpWriter writer;
- input.Process(handlers, &writer);
+ input.Process(handlers, &writer);*/
+
+ WritableDump dump("test.id");
}
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/71995
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I29350dbe9de280efa1248a6c87d174fa6494d32e
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits