Petr Onderka has uploaded a new change for review.
https://gerrit.wikimedia.org/r/80579
Change subject: Documentation of command line parameters + other small changes
......................................................................
Documentation of command line parameters + other small changes
Change-Id: I355014afbe2c2a954e559125bd1ec4da56e7f16c
---
M Diff/Changes/RevisionChange.cpp
M DumpException.cpp
M DumpException.h
M DumpKind.cpp
M DumpKind.h
M DumpWriters/DumpWriter.cpp
M README.md
M main.cpp
8 files changed, 148 insertions(+), 33 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental
refs/changes/79/80579/1
diff --git a/Diff/Changes/RevisionChange.cpp b/Diff/Changes/RevisionChange.cpp
index 1f469a7..e480b5f 100644
--- a/Diff/Changes/RevisionChange.cpp
+++ b/Diff/Changes/RevisionChange.cpp
@@ -80,8 +80,8 @@
{
RevisionChange result(withText);
- ReadValue(stream, result.flags);
ReadValue(stream, result.revisionChanges.RevisionId);
+ ReadValue(stream, result.flags);
if (HasFlag(result.flags, RevisionChangeFlags::Flags))
ReadValue(stream, result.revisionChanges.Flags);
@@ -121,8 +121,8 @@
void RevisionChange::WriteInternal()
{
WriteValue(ChangeKind::ChangeRevision);
- WriteValue(flags);
WriteValue(revisionChanges.RevisionId);
+ WriteValue(flags);
if (HasFlag(flags, RevisionChangeFlags::Flags))
WriteValue(revisionChanges.Flags);
@@ -158,8 +158,8 @@
std::uint32_t result = 0;
result += ValueSize(ChangeKind::ChangeRevision);
- result += ValueSize(flags);
result += ValueSize(revisionChanges.RevisionId);
+ result += ValueSize(flags);
if (HasFlag(flags, RevisionChangeFlags::Flags))
result += ValueSize(revisionChanges.Flags);
diff --git a/DumpException.cpp b/DumpException.cpp
index 239fbb5..761cc94 100644
--- a/DumpException.cpp
+++ b/DumpException.cpp
@@ -4,7 +4,7 @@
: message(message)
{}
-const char* UserException::what() const throw()
+const char* UserException::what() const NOEXCEPT
{
return message.c_str();
}
diff --git a/DumpException.h b/DumpException.h
index 9aa01db..4b89a9f 100644
--- a/DumpException.h
+++ b/DumpException.h
@@ -3,6 +3,13 @@
#include <exception>
#include <string>
+// http://stackoverflow.com/a/18387764/41071
+#ifndef _MSC_VER
+#define NOEXCEPT noexcept
+#else
+#define NOEXCEPT
+#endif
+
class DumpException : public std::exception
{};
@@ -15,7 +22,7 @@
public:
UserException(const std::string &message);
- virtual const char* what() const throw() override;
+ virtual const char* what() const NOEXCEPT override;
};
// user exception that was caused by invalid parameters,
diff --git a/DumpKind.cpp b/DumpKind.cpp
index 0c82f26..a1900ff 100644
--- a/DumpKind.cpp
+++ b/DumpKind.cpp
@@ -34,4 +34,22 @@
{
first = first | second;
return first;
+}
+
+std::ostream& operator<<(std::ostream& stream, DumpKind dumpKind)
+{
+ if (IsStub(dumpKind))
+ stream << "stub";
+ else
+ stream << "pages";
+
+ if (IsCurrent(dumpKind))
+ stream << "-current";
+ else
+ stream << "-history";
+
+ if (IsArticles(dumpKind))
+ stream << "-articles";
+
+ return stream;
}
\ No newline at end of file
diff --git a/DumpKind.h b/DumpKind.h
index 3b25f0d..08ff048 100644
--- a/DumpKind.h
+++ b/DumpKind.h
@@ -1,6 +1,7 @@
#pragma once
#include <cstdint>
+#include <iostream>
enum class DumpKind : std::uint8_t
{
@@ -19,4 +20,6 @@
bool IsArticles(DumpKind dumpKind);
DumpKind operator |(DumpKind first, DumpKind second);
-DumpKind operator |=(DumpKind &first, DumpKind second);
\ No newline at end of file
+DumpKind operator |=(DumpKind &first, DumpKind second);
+
+std::ostream& operator<<(std::ostream& os, DumpKind obj);
\ No newline at end of file
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 9ff4c5b..2ac4611 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -1,8 +1,9 @@
+#include <algorithm>
#include "DumpWriter.h"
#include "../DumpObjects/DumpRevision.h"
#include "../CollectionHelpers.h"
#include "../Indexes/Index.h"
-#include <algorithm>
+#include "../format.h"
DumpWriter::DumpWriter(std::shared_ptr<WritableDump> dump, bool withText,
std::unique_ptr<DiffWriter> diffWriter)
: dump(dump), withText(withText), diffWriter(std::move(diffWriter))
@@ -85,8 +86,18 @@
if (withText)
dumpKind |= DumpKind::Pages;
- dump->fileHeader.Kind = dumpKind;
- dump->fileHeader.Write();
+ // empty name means it's a new dump
+ if (dump->siteInfo->name.empty())
+ {
+ dump->fileHeader.Kind = dumpKind;
+ dump->fileHeader.Write();
+ }
+ else
+ {
+ if (dump->fileHeader.Kind != dumpKind)
+ throw UserException(str(fmt::Format(
+ "The specified dump kind ({0}) is not the same as the kind of
the dump ({1}).") << dumpKind << dump->fileHeader.Kind));
+ }
}
void DumpWriter::EndDump()
diff --git a/README.md b/README.md
index 81c89dd..0cf68c3 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
[Incremental dumps][1] are an improved format for dumps of content from
WikiMedia wikis.
-== Compiling ==
+## Compiling
-=== Linux ===
+### Linux
You will need cmake 2.8 and gcc 4.8 (gcc 4.7 could work too, I haven't tested
that).
To compile this project, run:
@@ -10,30 +10,106 @@
cmake .
make
-=== Windows ===
+### Windows
Open the solution in Visual Studio 2012 (Visual Studio Express 2012 could work
too, I haven't tested that) and build it.
-== Running the application ==
+## Running the application
-Compiling produces a command-line application `idumps`.
+Compiling produces a command-line application `idumps` (or `idumps.exe` on
Windows).
-It can be used to convert pages-history XML dump to one or more types of
incremental dumps
-and also to convert an incremental dump back to XML.
-Running it wihtout parameters produces a short usage message, explaining the
meaning of parameters.
+Running it without parameters produces a short usage message, explaining
possible actions and the meaning of their parameters.
-When creating a dump with a file name that already exists, the program tried
to use the existing file.
-This can cause problems, so if an error happens, delete the dump file and try
running the command again
+### For dump readers
-=== Examples ===
+The following actions are useful for normal users of dumps, i.e. those who
want to download and process them, not to create their own dumps.
+In the future, there might be a special version of `idumps` that contains only
these actions.
- idumps c tenwiki-20130622-pages-meta-history.xml sh sh.id pca pca.id
+#### Reading a dump
-Creates stub-history dump `sh.id` and pages-articles dump `pca.id` from the
XML dump of tenwiki.
+The `r` (or `read`) action is for reading a dump and converting it to XML.
+It takes two parameters: path to the dump file and path to the generated XML
file.
+If the XML file already exists, it will be overwritten.
- idumps r sh.id sh.xml
+Example:
-Creates `sh.xml`, XML version of the dump `sh.id`.
+ idumps r dump.id dump.xml
+#### Applying a diff dump
+
+The `a` (or `apply`) action is for applying diff dump to existing normal dump
to update it.
+It takes two parameters: path to the dump file and path to the diff dump file.
+If the diff cannot be applied to the dump (because it's for different wiki, or
for a dump with different timestamp), an error is printed.
+Applying the diff also updates the timestamp of the dump.
+
+Example:
+
+ idumps a dump.id diff.dd
+
+### For dump creators
+
+The following actions are for dump creators, usually those who want to create
a dump of their wiki.
+
+#### Updating a dump from wiki
+
+The `u` (or `update`) action is for updating or creating a dump based on
communication with a wiki.
+The first five parameters are:
+
+* Name of the wiki (e.g. `enwiki`). This has to match with the name that's
already in the dump, if it exists.
+
+* New timestamp for the updated dump. This has to be different than the
current timestamp in the dump, if it exists.
+
+ Timestamps are used in diff dumps, to make sure only the right diff can be
applied to the dump.
+
+* Path to the PHP interpreter. This can be simply `php`, if `php` is in `PATH`.
+
+* Path to the dumpBackup maintenance script, possibly with optional parameters
(e.g. `--report`).
+
+ The script will be called with additional parameters `--full --stub`, which
are added automatically.
+
+* Path to the fetchText maintenance script.
+
+The remaining parameters specify what dumps to update/create, each group of
two or three parameters represents separate dump.
+
+The parameters in each group are:
+
+* Dump specification.
+
+* Path to the dump file.
+
+* Path to the created diff dump (if diff was included in the specification).
+
+ If the file already exists, it will be overwritten.
+
+The specification is a 2- to 4-letter string that describes what kind of dump
to create:
+
+* 1. letter: `p` for pages dump or `s` for stub dump
+* 2. letter: `h` for history dump or `c` for current dump
+* 3. optional letter: `a` for articles dump (without talk and User namespaces)
+* 4. optional letter: `d` to also create diff dump
+
+Example:
+
+ idumps u enwiki 20130823 php "/var/www/maintenance/dumpBackup.php
--report=10000" /var/www/maintenance/fetchText.php pca pages-articles.id shd
stub-history.id stub-history.dd
+
+This sets the name of the wiki to `enwiki`, timestamp to `20130823`. The PHP
interpreter is in `PATH`, MediaWiki is installed in `/var/www` and dumpBackup
will report each 10000 revisions.
+
+The updated or created dumps are a pages-current-articles dump
pages-articles.id and a stub-history dump stub-history.id that also has a diff
dump stub-history.dd.
+
+#### Creating a dump from XML
+
+The `c` (or `create`) action is for creating incremental dump based on
pages-history XML dump.
+
+The parameters are similar as in the `update` action:
+
+* Name of the wiki.
+* Timestamp of the created dump.
+* Path to the source XML dump.
+
+The remaining parameters specify what dumps to create, just as in `update`.
+
+Example:
+
+ idumps c enwiki 20130823 enwiki-20130823-pages-meta-history.xml sc sc.id
[1]: http://www.mediawiki.org/wiki/User:Svick/Incremental_dumps
diff --git a/main.cpp b/main.cpp
index c575cd0..5a20936 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,19 +15,19 @@
void printUsage()
{
std::cout << "Usage:\n";
- std::cout << "creating dump: idumps c[reate] name timestamp source.xml
spec dump.id ...\n";
+ std::cout << "reading dump: idumps r[ead] dump.id output.xml\n";
+ std::cout << "applying diff: idumps a[pply] dump.id diff.dd\n";
+ std::cout << "updating dump: idumps u[pdate] name new-timestamp phpPath
dumpBackup fetchText spec dump.id ...\n";
std::cout << " name is the name of the wiki (e.g. enwiki)\n";
std::cout << " timestamp identifies this specific dump (it doesn't
actually have to be a timestamp)\n";
std::cout << " spec is a 2 to 4-letter string that describes what kind of
dump to create:\n";
- std::cout << " 1. letter: p for pages dump or s for stub dump:\n";
- std::cout << " 2. letter: h for history dump or c for current dump:\n";
- std::cout << " 3. optional letter: a for articles dump:\n";
- std::cout << " 4. optional letter: d to also create diff dump:\n";
+ std::cout << " 1. letter: p for pages dump or s for stub dump\n";
+ std::cout << " 2. letter: h for history dump or c for current dump\n";
+ std::cout << " 3. optional letter: a for articles dump\n";
+ std::cout << " 4. optional letter: d to also create diff dump\n";
std::cout << " add the path to the diff dump after the path to dump\n";
std::cout << " example: sh for stub-meta-history, pcad for pages-articles
with diff dump\n";
- std::cout << "updating dump: idumps u[pdate] name new-timestamp phpPath
dumpBackup fetchText spec dump.id ...\n";
- std::cout << "reading dump: idumps r[ead] dump.id output.xml\n";
- std::cout << "applying diff: idumps a[pply] dump.id diff.dd\n";
+ std::cout << "creating dump: idumps c[reate] name timestamp source.xml
spec dump.id ...\n";
}
std::unique_ptr<IDumpWriter> createWriter(std::queue<std::string> &parameters,
const std::string &name, const std::string &timestamp)
@@ -193,7 +193,7 @@
exec_stream_t dumpBackupProcess;
dumpBackupProcess.set_buffer_limit(exec_stream_t::s_out, 8192);
- dumpBackupProcess.start(phpPath, dumpBackupParameters);
+ dumpBackupProcess.start(phpPath, dumpBackupParameters + " --full --stub");
WrapperInputStream dumpBackupStream(dumpBackupProcess.out(),
[&]()
--
To view, visit https://gerrit.wikimedia.org/r/80579
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I355014afbe2c2a954e559125bd1ec4da56e7f16c
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits