Brion VIBBER has submitted this change and it was merged.
Change subject: [WIP] Support reading and writing the 0.10 XML schema
......................................................................
[WIP] Support reading and writing the 0.10 XML schema
* bump package version to 1.25
* work around stray <sha1> under <page> on Wikia exports
TODO:
* Look into "deleted" attribute handling... I don't see how this information
can be read from dumps, nor is it written out.
* Needs tests.
Change-Id: I8715e97da855eb7d9d87c88f95e157e2af1fe784
---
M README
M pom.xml
M src/org/mediawiki/dumper/Dumper.java
M src/org/mediawiki/importer/DumpWriter.java
M src/org/mediawiki/importer/LatestFilter.java
M src/org/mediawiki/importer/MultiWriter.java
A src/org/mediawiki/importer/Namespace.java
M src/org/mediawiki/importer/NamespaceSet.java
M src/org/mediawiki/importer/Page.java
M src/org/mediawiki/importer/PageFilter.java
M src/org/mediawiki/importer/Revision.java
M src/org/mediawiki/importer/RevisionListFilter.java
M src/org/mediawiki/importer/Siteinfo.java
M src/org/mediawiki/importer/SphinxWriter.java
M src/org/mediawiki/importer/SqlWriter.java
M src/org/mediawiki/importer/TimeStampFilter.java
A src/org/mediawiki/importer/Wikiinfo.java
M src/org/mediawiki/importer/XmlDumpReader.java
C src/org/mediawiki/importer/XmlDumpWriter0_10.java
R src/org/mediawiki/importer/XmlDumpWriter0_3.java
M tests/org/mediawiki/importer/TitleTest.java
21 files changed, 283 insertions(+), 92 deletions(-)
Approvals:
Brion VIBBER: Verified; Looks good to me, approved
diff --git a/README b/README
index 73af93f..5788290 100644
--- a/README
+++ b/README
@@ -76,6 +76,9 @@
--format=xml
Output back to MediaWiki's XML export format; use this for
filtering dumps for limited import. Output should be idempotent.
+ --format=xml:0.3
+ Output in legacy 0.3 XML format; use with tools that can't handle
+ anything newer.
--format=mysql:1.4
SQL statements formatted for bulk import in MediaWiki 1.4's schema.
(MySQL output format.)
diff --git a/pom.xml b/pom.xml
index 39bec4d..30e77b3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
<name>mwdumper</name>
<groupId>org.wikimedia</groupId>
<artifactId>mwdumper</artifactId>
- <version>1.16</version>
+ <version>1.25</version>
<packaging>jar</packaging>
<url>http://www.mediawiki.org/wiki/MWDumper</url>
diff --git a/src/org/mediawiki/dumper/Dumper.java
b/src/org/mediawiki/dumper/Dumper.java
index 44b747a..177f42a 100644
--- a/src/org/mediawiki/dumper/Dumper.java
+++ b/src/org/mediawiki/dumper/Dumper.java
@@ -89,7 +89,7 @@
if (output != null) {
// Finish constructing the
previous output...
if (sink == null)
- sink = new
XmlDumpWriter(output.getFileStream());
+ sink = new
XmlDumpWriter0_10(output.getFileStream());
writers.add(sink);
sink = null;
}
@@ -104,7 +104,7 @@
if (sink == null) {
if (output == null)
output = new
OutputWrapper(Tools.openStandardOutput());
- sink = new
XmlDumpWriter(output.getFileStream());
+ sink = new
XmlDumpWriter0_10(output.getFileStream());
}
sink = addFilter(sink, val, param);
} else if (opt.equals("progress")) {
@@ -131,7 +131,7 @@
output = new OutputWrapper(Tools.openStandardOutput());
// Finish stacking the last output sink
if (sink == null)
- sink = new XmlDumpWriter(output.getFileStream());
+ sink = new XmlDumpWriter0_10(output.getFileStream());
writers.add(sink);
DumpWriter outputSink = (progressInterval > 0)
@@ -237,11 +237,17 @@
}
static DumpWriter openOutputSink(OutputWrapper output, String format,
String param) throws IOException {
- if (format.equals("xml"))
- return new XmlDumpWriter(output.getFileStream());
- else if (format.equals("sphinx"))
+ if (format.equals("xml")) {
+ if (param.equals("0.3")) {
+ return new
XmlDumpWriter0_3(output.getFileStream());
+ } else if (param.length() == 0 || param.equals("0.10"))
{
+ return new
XmlDumpWriter0_10(output.getFileStream());
+ } else {
+ throw new IllegalArgumentException("XML schema
version not known: " + param);
+ }
+ } else if (format.equals("sphinx")) {
return new SphinxWriter(output.getFileStream());
- else if (format.equals("mysql") || format.equals("pgsql") ||
format.equals("sql")) {
+ } else if (format.equals("mysql") || format.equals("pgsql") ||
format.equals("sql")) {
SqlStream sqlStream = output.getSqlStream();
SqlWriter ret;
@@ -259,8 +265,9 @@
throw new IllegalArgumentException("SQL version
not known: " + param);
return ret;
- } else
+ } else {
throw new IllegalArgumentException("Output format not
known: " + format);
+ }
}
// ----------------
diff --git a/src/org/mediawiki/importer/DumpWriter.java
b/src/org/mediawiki/importer/DumpWriter.java
index b857ddc..eb21f02 100644
--- a/src/org/mediawiki/importer/DumpWriter.java
+++ b/src/org/mediawiki/importer/DumpWriter.java
@@ -30,7 +30,7 @@
public interface DumpWriter {
void close() throws IOException;
- void writeStartWiki() throws IOException;
+ void writeStartWiki(Wikiinfo info) throws IOException;
void writeEndWiki() throws IOException;
void writeSiteinfo(Siteinfo info) throws IOException;
diff --git a/src/org/mediawiki/importer/LatestFilter.java
b/src/org/mediawiki/importer/LatestFilter.java
index e0e051f..709f5bd 100644
--- a/src/org/mediawiki/importer/LatestFilter.java
+++ b/src/org/mediawiki/importer/LatestFilter.java
@@ -39,8 +39,8 @@
sink.close();
}
- public void writeStartWiki() throws IOException {
- sink.writeStartWiki();
+ public void writeStartWiki(Wikiinfo info) throws IOException {
+ sink.writeStartWiki(info);
}
public void writeEndWiki() throws IOException {
diff --git a/src/org/mediawiki/importer/MultiWriter.java
b/src/org/mediawiki/importer/MultiWriter.java
index 955cc8e..305cfdf 100644
--- a/src/org/mediawiki/importer/MultiWriter.java
+++ b/src/org/mediawiki/importer/MultiWriter.java
@@ -42,10 +42,10 @@
}
}
- public void writeStartWiki() throws IOException {
+ public void writeStartWiki(Wikiinfo info) throws IOException {
for (int i = 0; i < sinks.size(); i++) {
DumpWriter sink = sinks.get(i);
- sink.writeStartWiki();
+ sink.writeStartWiki(info);
}
}
diff --git a/src/org/mediawiki/importer/Namespace.java
b/src/org/mediawiki/importer/Namespace.java
new file mode 100644
index 0000000..1c44735
--- /dev/null
+++ b/src/org/mediawiki/importer/Namespace.java
@@ -0,0 +1,37 @@
+/*
+ * MediaWiki import/export processing tools
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+package org.mediawiki.importer;
+
+public class Namespace {
+ int Index;
+ String Prefix;
+ String Case;
+
+ public Namespace(int index, String prefix, String nscase) {
+ Index = index;
+ Prefix = prefix;
+ Case = nscase;
+ }
+}
diff --git a/src/org/mediawiki/importer/NamespaceSet.java
b/src/org/mediawiki/importer/NamespaceSet.java
index fcc588b..11d350f 100644
--- a/src/org/mediawiki/importer/NamespaceSet.java
+++ b/src/org/mediawiki/importer/NamespaceSet.java
@@ -31,21 +31,21 @@
import java.util.Iterator;
public class NamespaceSet {
- Map<String, Integer> byname;
- Map<Integer, String> bynumber;
+ Map<String, Namespace> byname;
+ Map<Integer, Namespace> bynumber;
public NamespaceSet() {
- byname = new HashMap<String, Integer>();
- bynumber = new LinkedHashMap<Integer, String>();
+ byname = new HashMap<String, Namespace>();
+ bynumber = new LinkedHashMap<Integer, Namespace>();
}
- public void add(int index, String prefix) {
- add(new Integer(index), prefix);
+ public void add(int index, String prefix, String nscase) {
+ add(new Integer(index), prefix, nscase);
}
- public void add(Integer index, String prefix) {
- byname.put(prefix, index);
- bynumber.put(index, prefix);
+ public void add(Integer index, String prefix, String nscase) {
+ byname.put(prefix, new Namespace(index, prefix, nscase));
+ bynumber.put(index, new Namespace(index, prefix, nscase));
}
public boolean hasPrefix(String prefix) {
@@ -57,11 +57,11 @@
}
public String getPrefix(Integer index) {
- return bynumber.get(index);
+ return bynumber.get(index).Prefix;
}
public Integer getIndex(String prefix) {
- return byname.get(prefix);
+ return byname.get(prefix).Index;
}
public String getColonPrefix(Integer index) {
@@ -71,7 +71,7 @@
return prefix;
}
- public Iterator<Map.Entry<Integer, String>> orderedEntries() {
+ public Iterator<Map.Entry<Integer, Namespace>> orderedEntries() {
return bynumber.entrySet().iterator();
}
}
diff --git a/src/org/mediawiki/importer/Page.java
b/src/org/mediawiki/importer/Page.java
index e8a1cae..edc21c9 100644
--- a/src/org/mediawiki/importer/Page.java
+++ b/src/org/mediawiki/importer/Page.java
@@ -29,7 +29,9 @@
public class Page {
public Title Title;
+ public int Ns;
public int Id;
+ public Title Redirect;
public boolean isRedirect = false;
public Hashtable<String,Object> DiscussionThreadingInfo;
public String Restrictions;
diff --git a/src/org/mediawiki/importer/PageFilter.java
b/src/org/mediawiki/importer/PageFilter.java
index f831f76..d1338b1 100644
--- a/src/org/mediawiki/importer/PageFilter.java
+++ b/src/org/mediawiki/importer/PageFilter.java
@@ -39,8 +39,8 @@
sink.close();
}
- public void writeStartWiki() throws IOException {
- sink.writeStartWiki();
+ public void writeStartWiki(Wikiinfo info) throws IOException {
+ sink.writeStartWiki(info);
}
public void writeEndWiki() throws IOException {
diff --git a/src/org/mediawiki/importer/Revision.java
b/src/org/mediawiki/importer/Revision.java
index 7b59550..9d6f695 100644
--- a/src/org/mediawiki/importer/Revision.java
+++ b/src/org/mediawiki/importer/Revision.java
@@ -29,13 +29,19 @@
public class Revision {
public int Id;
+ public int Parentid;
public Calendar Timestamp;
public Contributor Contributor;
public String Comment;
+ public String Model;
+ public String Format;
public String Text;
+ public Integer Bytes;
+ public String Sha1;
public boolean Minor;
public Revision() {
+ Parentid = 0;
Comment = "";
Text = "";
Minor = false;
diff --git a/src/org/mediawiki/importer/RevisionListFilter.java
b/src/org/mediawiki/importer/RevisionListFilter.java
index c763b22..44f4f13 100644
--- a/src/org/mediawiki/importer/RevisionListFilter.java
+++ b/src/org/mediawiki/importer/RevisionListFilter.java
@@ -58,8 +58,8 @@
sink.close();
}
- public void writeStartWiki() throws IOException {
- sink.writeStartWiki();
+ public void writeStartWiki(Wikiinfo info) throws IOException {
+ sink.writeStartWiki(info);
}
public void writeEndWiki() throws IOException {
diff --git a/src/org/mediawiki/importer/Siteinfo.java
b/src/org/mediawiki/importer/Siteinfo.java
index eec4579..27364c5 100644
--- a/src/org/mediawiki/importer/Siteinfo.java
+++ b/src/org/mediawiki/importer/Siteinfo.java
@@ -27,6 +27,7 @@
public class Siteinfo {
public String Sitename;
+ public String Dbname;
public String Base;
public String Generator;
public String Case;
diff --git a/src/org/mediawiki/importer/SphinxWriter.java
b/src/org/mediawiki/importer/SphinxWriter.java
index c0236a0..c5ffadb 100644
--- a/src/org/mediawiki/importer/SphinxWriter.java
+++ b/src/org/mediawiki/importer/SphinxWriter.java
@@ -47,7 +47,7 @@
writer.close();
}
- public void writeStartWiki() throws IOException {
+ public void writeStartWiki(Wikiinfo info) throws IOException {
writer.openXml();
// No containing element to open
}
diff --git a/src/org/mediawiki/importer/SqlWriter.java
b/src/org/mediawiki/importer/SqlWriter.java
index 74c0b17..a155157 100644
--- a/src/org/mediawiki/importer/SqlWriter.java
+++ b/src/org/mediawiki/importer/SqlWriter.java
@@ -124,8 +124,9 @@
stream.close();
}
- public void writeStartWiki() throws IOException {
+ public void writeStartWiki(Wikiinfo info) throws IOException {
stream.writeComment("-- MediaWiki XML dump converted to SQL by
mwdumper");
+ stream.writeComment("-- Lang: " + commentSafe(info.Lang));
stream.writeStatement("BEGIN");
String prologue = traits.getWikiPrologue();
@@ -146,14 +147,15 @@
public void writeSiteinfo(Siteinfo info) throws IOException {
stream.writeComment("");
stream.writeComment("-- Site: " + commentSafe(info.Sitename));
+ stream.writeComment("-- DB name: " + commentSafe(info.Dbname));
stream.writeComment("-- URL: " + commentSafe(info.Base));
stream.writeComment("-- Generator: " +
commentSafe(info.Generator));
stream.writeComment("-- Case: " + commentSafe(info.Case));
stream.writeComment("--");
stream.writeComment("-- Namespaces:");
- for (Iterator<Map.Entry<Integer, String>> i =
info.Namespaces.orderedEntries(); i.hasNext();) {
- Map.Entry<Integer, String> e = i.next();
- stream.writeComment("-- " + e.getKey() + ": " +
e.getValue());
+ for (Iterator<Map.Entry<Integer, Namespace>> i =
info.Namespaces.orderedEntries(); i.hasNext();) {
+ Map.Entry<Integer, Namespace> e = i.next();
+ stream.writeComment("-- " + e.getKey() + ": " +
e.getValue().Prefix);
}
stream.writeComment("");
}
diff --git a/src/org/mediawiki/importer/TimeStampFilter.java
b/src/org/mediawiki/importer/TimeStampFilter.java
index ab7ec0a..50c0c63 100644
--- a/src/org/mediawiki/importer/TimeStampFilter.java
+++ b/src/org/mediawiki/importer/TimeStampFilter.java
@@ -45,8 +45,8 @@
sink.close();
}
- public void writeStartWiki() throws IOException {
- sink.writeStartWiki();
+ public void writeStartWiki(Wikiinfo info) throws IOException {
+ sink.writeStartWiki(info);
}
public void writeEndWiki() throws IOException {
diff --git a/src/org/mediawiki/importer/Wikiinfo.java
b/src/org/mediawiki/importer/Wikiinfo.java
new file mode 100644
index 0000000..ea4a94f
--- /dev/null
+++ b/src/org/mediawiki/importer/Wikiinfo.java
@@ -0,0 +1,29 @@
+/*
+ * MediaWiki import/export processing tools
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+package org.mediawiki.importer;
+
+public class Wikiinfo {
+ public String Lang;
+}
diff --git a/src/org/mediawiki/importer/XmlDumpReader.java
b/src/org/mediawiki/importer/XmlDumpReader.java
index f91f9d1..8431b5e 100644
--- a/src/org/mediawiki/importer/XmlDumpReader.java
+++ b/src/org/mediawiki/importer/XmlDumpReader.java
@@ -50,12 +50,14 @@
private boolean hasContent = false;
private boolean deleted = false;
+ Wikiinfo wikiinfo;
Siteinfo siteinfo;
Page page;
boolean pageSent;
Contributor contrib;
Revision rev;
int nskey;
+ String nscase;
boolean abortFlag;
@@ -116,6 +118,8 @@
startElements.put("siteinfo","siteinfo");
startElements.put("namespaces","namespaces");
startElements.put("namespace","namespace");
+ startElements.put("redirect","redirect");
+ startElements.put("text","text");
endElements.put("ThreadSubject","ThreadSubject");
endElements.put("ThreadParent","ThreadParent");
@@ -129,18 +133,23 @@
endElements.put("base","base");
endElements.put("case","case");
endElements.put("comment","comment");
+ endElements.put("dbname","dbname");
endElements.put("contributor","contributor");
+ endElements.put("format","format");
endElements.put("generator","generator");
endElements.put("id","id");
endElements.put("ip","ip");
endElements.put("mediawiki", "mediawiki");
endElements.put("minor","minor");
+ endElements.put("model","model");
endElements.put("namespaces","namespaces");
endElements.put("namespace","namespace");
+ endElements.put("ns","ns");
endElements.put("page","page");
- endElements.put("redirect","redirect");
+ endElements.put("parentid","parentid");
endElements.put("restrictions","restrictions");
endElements.put("revision","revision");
+ endElements.put("sha1","sha1");
endElements.put("siteinfo","siteinfo");
endElements.put("sitename","sitename");
endElements.put("text","text");
@@ -161,6 +170,7 @@
// check for deleted="deleted", and set deleted flag for the
current element.
String d = attributes.getValue("deleted");
+ // FIXME: I'm not sure the deleted attribute is preserved in
our objects?
deleted = (d!=null && d.equals("deleted"));
try {
@@ -171,8 +181,10 @@
if (qName == "revision") openRevision();
else if (qName == "contributor") openContributor();
else if (qName == "page") openPage();
+ else if (qName == "redirect") openRedirect(attributes);
+ else if (qName == "text") openText(attributes);
// rare tags:
- else if (qName == "mediawiki") openMediaWiki();
+ else if (qName == "mediawiki")
openMediaWiki(attributes);
else if (qName == "siteinfo") openSiteinfo();
else if (qName == "namespaces") openNamespaces();
else if (qName == "namespace")
openNamespace(attributes);
@@ -203,6 +215,7 @@
// frequent tags:
if (qName == "id") readId();
else if (qName == "revision") closeRevision();
+ else if (qName == "parentid") readParentid();
else if (qName == "timestamp") readTimestamp();
else if (qName == "text") readText();
else if (qName == "contributor") closeContributor();
@@ -210,15 +223,19 @@
else if (qName == "ip") readIp();
else if (qName == "comment") readComment();
else if (qName == "minor") readMinor();
+ else if (qName == "model") readModel();
+ else if (qName == "format") readFormat();
+ else if (qName == "sha1") readSha1();
else if (qName == "page") closePage();
else if (qName == "title") readTitle();
+ else if (qName == "ns") readNs();
else if (qName == "restrictions") readRestrictions();
- else if (qName == "redirect") readRedirect();
// rare tags:
else if (qName.startsWith("Thread"))
threadAttribute(qName);
else if (qName == "mediawiki") closeMediaWiki();
else if (qName == "siteinfo") closeSiteinfo();
else if (qName == "sitename") readSitename();
+ else if (qName == "dbname") readDbname();
else if (qName == "base") readBase();
else if (qName == "generator") readGenerator();
else if (qName == "case") readCase();
@@ -239,9 +256,16 @@
page.DiscussionThreadingInfo.put(attrib,
bufferContents());
}
- void openMediaWiki() throws IOException {
+ void openMediaWiki(Attributes attributes) throws IOException {
siteinfo = null;
- writer.writeStartWiki();
+
+ wikiinfo = new Wikiinfo();
+ wikiinfo.Lang = attributes.getValue("xml:lang");
+ if (wikiinfo.Lang.length() == 0) {
+ wikiinfo.Lang = "en";
+ }
+
+ writer.writeStartWiki(wikiinfo);
}
void closeMediaWiki() throws IOException {
@@ -272,6 +296,10 @@
siteinfo.Sitename = bufferContents();
}
+ void readDbname() {
+ siteinfo.Dbname = bufferContents();
+ }
+
void readBase() {
siteinfo.Base = bufferContents();
}
@@ -290,10 +318,11 @@
void openNamespace(Attributes attribs) {
nskey = Integer.parseInt(attribs.getValue("key"));
+ nscase = attribs.getValue("case");
}
void closeNamespace() {
- siteinfo.Namespaces.add(nskey, bufferContents());
+ siteinfo.Namespaces.add(nskey, bufferContents(), nscase);
}
void closeNamespaces() {
@@ -316,7 +345,11 @@
void readTitle() {
page.Title = new Title(bufferContents(), siteinfo.Namespaces);
}
-
+
+ void readNs() {
+ page.Ns = Integer.parseInt(bufferContents());
+ }
+
void readId() {
int id = Integer.parseInt(bufferContents());
if (contrib != null)
@@ -328,11 +361,15 @@
else
throw new IllegalArgumentException("Unexpected <id>
outside a <page>, <revision>, or <contributor>");
}
-
- void readRedirect() {
+
+ void openRedirect(Attributes attributes) {
+ String title = attributes.getValue("title");
+ if (title != null) {
+ page.Redirect = new Title(title, siteinfo.Namespaces);
+ }
page.isRedirect = true;
}
-
+
void readRestrictions() {
page.Restrictions = bufferContents();
}
@@ -353,6 +390,10 @@
rev = null;
}
+ void readParentid() {
+ rev.Parentid = Integer.parseInt(bufferContents());
+ }
+
void readTimestamp() {
rev.Timestamp = parseUTCTimestamp(bufferContents());
}
@@ -366,10 +407,38 @@
rev.Minor = true;
}
+ void readModel() {
+ rev.Model = bufferContents();
+ }
+
+ void readFormat() {
+ rev.Format = bufferContents();
+ }
+
+ void openText(Attributes attributes) {
+ String bytes = attributes.getValue("bytes");
+ if (bytes != null) {
+ rev.Bytes = new Integer(bytes);
+ }
+ }
+
void readText() {
rev.Text = bufferContentsOrNull();
if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null
means deleted/suppressed
}
+
+ void readSha1() {
+ String sha1 = bufferContents();
+
+ if (rev != null) {
+ rev.Sha1 = sha1;
+ } else if (page != null) {
+ // Wikia running MW 1.19 emits a <sha1> directly under
<page>;
+ // ignore it for now as it's unclear how to map it.
+ } else {
+ throw new IllegalArgumentException("Unexpected <id>
outside a <revision>");
+ }
+ }
// -----------
void openContributor() {
diff --git a/src/org/mediawiki/importer/XmlDumpWriter.java
b/src/org/mediawiki/importer/XmlDumpWriter0_10.java
similarity index 70%
copy from src/org/mediawiki/importer/XmlDumpWriter.java
copy to src/org/mediawiki/importer/XmlDumpWriter0_10.java
index 872dab7..65010da 100644
--- a/src/org/mediawiki/importer/XmlDumpWriter.java
+++ b/src/org/mediawiki/importer/XmlDumpWriter0_10.java
@@ -29,24 +29,25 @@
import java.io.OutputStream;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
+import java.util.ArrayList;
import java.util.Calendar;
import java.util.Iterator;
import java.util.Map;
import java.util.TimeZone;
-public class XmlDumpWriter implements DumpWriter {
+public class XmlDumpWriter0_10 implements DumpWriter {
protected OutputStream stream;
protected XmlWriter writer;
- protected static final String version = "0.3";
+ protected static final String version = "0.10";
protected static final String ns =
"http://www.mediawiki.org/xml/export-" + version + "/";
protected static final String schema =
"http://www.mediawiki.org/xml/export-" + version + ".xsd";
protected static final DateFormat dateFormat = new
SimpleDateFormat("yyyy'-'MM'-'dd'T'HH':'mm':'ss'Z'");
static {
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
}
-
- public XmlDumpWriter(OutputStream output) throws IOException {
+
+ public XmlDumpWriter0_10(OutputStream output) throws IOException {
stream = output;
writer = new XmlWriter(stream);
}
@@ -55,15 +56,14 @@
writer.close();
}
- public void writeStartWiki() throws IOException {
+ public void writeStartWiki(Wikiinfo info) throws IOException {
writer.openXml();
writer.openElement("mediawiki", new String[][] {
{"xmlns", ns},
{"xmlns:xsi",
"http://www.w3.org/2001/XMLSchema-instance"},
{"xsi:schemaLocation", ns + " " + schema},
{"version", version},
- {"xml:lang", "en"}});
- // TODO: store and keep the xml:lang
+ {"xml:lang", info.Lang}});
}
public void writeEndWiki() throws IOException {
@@ -75,15 +75,25 @@
XmlWriter writer = this.writer;
writer.openElement("siteinfo");
writer.textElement("sitename", info.Sitename);
+ if (info.Dbname != null) {
+ writer.textElement("dbname", info.Dbname);
+ }
writer.textElement("base", info.Base);
writer.textElement("generator", info.Generator);
writer.textElement("case", info.Case);
writer.openElement("namespaces");
- for (Iterator<Map.Entry<Integer, String>> i =
info.Namespaces.orderedEntries(); i.hasNext();) {
- Map.Entry<Integer, String> e = i.next();
- writer.textElement("namespace",
e.getValue().toString(), new String[][] {
- {"key", e.getKey().toString()}});
+ for (Iterator<Map.Entry<Integer, Namespace>> i =
info.Namespaces.orderedEntries(); i.hasNext();) {
+ Map.Entry<Integer, Namespace> e = i.next();
+
+ ArrayList<String[]> textAttribs = new
ArrayList<String[]>();
+ textAttribs.add(new String[] {"key",
e.getKey().toString()});
+ if (e.getValue().Case != null) {
+ textAttribs.add(new String[] {"case",
e.getValue().Case});
+ }
+ writer.textElement("namespace", e.getValue().Prefix,
+ textAttribs.toArray(new String[][] {})
+ );
}
writer.closeElement();
@@ -94,8 +104,15 @@
XmlWriter writer = this.writer;
writer.openElement("page");
writer.textElement("title", page.Title.toString());
- if (page.Id != 0)
+ writer.textElement("ns", Integer.toString(page.Ns));
+ if (page.Id != 0) {
writer.textElement("id", Integer.toString(page.Id));
+ }
+ if (page.Redirect != null) {
+ writer.emptyElement("redirect", new String[][] {
+ {"title", page.Redirect.toString()}
+ });
+ }
if (page.Restrictions != null && page.Restrictions.length() !=
0)
writer.textElement("restrictions", page.Restrictions);
}
@@ -107,8 +124,12 @@
public void writeRevision(Revision rev) throws IOException {
XmlWriter writer = this.writer;
writer.openElement("revision");
- if (rev.Id != 0)
+ if (rev.Id != 0) {
writer.textElement("id", Integer.toString(rev.Id));
+ }
+ if (rev.Parentid != 0) {
+ writer.textElement("parentid",
Integer.toString(rev.Parentid));
+ }
writer.textElement("timestamp", formatTimestamp(rev.Timestamp));
@@ -120,15 +141,30 @@
if (rev.Comment == null) {
writer.emptyElement("comment", deletedAttrib);
- }
+ }
else if (rev.Comment.length() != 0) {
writer.textElement("comment", rev.Comment);
}
-
- writer.textElement("text", rev.Text,
- rev.Text==null ? new String[][] {{"xml:space",
"preserve"}, {"deleted", "deleted"}}
-
: new String[][] {{"xml:space", "preserve"}}
- );
+
+ if (rev.Model != null) {
+ writer.textElement("model", rev.Model);
+ }
+ if (rev.Format != null) {
+ writer.textElement("format", rev.Format);
+ }
+
+ ArrayList<String[]> textAttribs = new ArrayList<String[]>();
+ textAttribs.add(new String[] {"xml:space", "preserve"});
+ if (rev.Text == null) {
+ textAttribs.add(new String[] {"deleted", "deleted"});
+ } else if (rev.Bytes != null) {
+ textAttribs.add(new String[] {"bytes",
rev.Bytes.toString()});
+ }
+ writer.textElement("text", rev.Text, textAttribs.toArray(new
String[][] {}));
+
+ if (rev.Sha1 != null) {
+ writer.textElement("sha1", rev.Sha1);
+ }
writer.closeElement();
}
diff --git a/src/org/mediawiki/importer/XmlDumpWriter.java
b/src/org/mediawiki/importer/XmlDumpWriter0_3.java
similarity index 90%
rename from src/org/mediawiki/importer/XmlDumpWriter.java
rename to src/org/mediawiki/importer/XmlDumpWriter0_3.java
index 872dab7..40cdbb5 100644
--- a/src/org/mediawiki/importer/XmlDumpWriter.java
+++ b/src/org/mediawiki/importer/XmlDumpWriter0_3.java
@@ -34,7 +34,7 @@
import java.util.Map;
import java.util.TimeZone;
-public class XmlDumpWriter implements DumpWriter {
+public class XmlDumpWriter0_3 implements DumpWriter {
protected OutputStream stream;
protected XmlWriter writer;
@@ -46,7 +46,7 @@
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
}
- public XmlDumpWriter(OutputStream output) throws IOException {
+ public XmlDumpWriter0_3(OutputStream output) throws IOException {
stream = output;
writer = new XmlWriter(stream);
}
@@ -55,15 +55,14 @@
writer.close();
}
- public void writeStartWiki() throws IOException {
+ public void writeStartWiki(Wikiinfo info) throws IOException {
writer.openXml();
writer.openElement("mediawiki", new String[][] {
{"xmlns", ns},
{"xmlns:xsi",
"http://www.w3.org/2001/XMLSchema-instance"},
{"xsi:schemaLocation", ns + " " + schema},
{"version", version},
- {"xml:lang", "en"}});
- // TODO: store and keep the xml:lang
+ {"xml:lang", info.Lang}});
}
public void writeEndWiki() throws IOException {
@@ -80,9 +79,9 @@
writer.textElement("case", info.Case);
writer.openElement("namespaces");
- for (Iterator<Map.Entry<Integer, String>> i =
info.Namespaces.orderedEntries(); i.hasNext();) {
- Map.Entry<Integer, String> e = i.next();
- writer.textElement("namespace",
e.getValue().toString(), new String[][] {
+ for (Iterator<Map.Entry<Integer, Namespace>> i =
info.Namespaces.orderedEntries(); i.hasNext();) {
+ Map.Entry<Integer, Namespace> e = i.next();
+ writer.textElement("namespace", e.getValue().Prefix,
new String[][] {
{"key", e.getKey().toString()}});
}
writer.closeElement();
diff --git a/tests/org/mediawiki/importer/TitleTest.java
b/tests/org/mediawiki/importer/TitleTest.java
index e9d723b..2fbc49d 100644
--- a/tests/org/mediawiki/importer/TitleTest.java
+++ b/tests/org/mediawiki/importer/TitleTest.java
@@ -37,24 +37,24 @@
protected void setUp() throws Exception {
super.setUp();
namespaces = new NamespaceSet();
- namespaces.add(-2, "Media");
- namespaces.add(-1, "Special");
- namespaces.add(0, "");
- namespaces.add(1, "Talk");
- namespaces.add(2, "User");
- namespaces.add(3, "User talk");
- namespaces.add(4, "Project");
- namespaces.add(5, "Project talk");
- namespaces.add(6, "Image");
- namespaces.add(7, "Image talk");
- namespaces.add(8, "MediaWiki");
- namespaces.add(9, "MediaWiki talk");
- namespaces.add(10, "Template");
- namespaces.add(11, "Template talk");
- namespaces.add(12, "Help");
- namespaces.add(13, "Help talk");
- namespaces.add(14, "Category");
- namespaces.add(15, "Category talk");
+ namespaces.add(-2, "Media", "first-letter");
+ namespaces.add(-1, "Special", "first-letter");
+ namespaces.add(0, "", "first-letter");
+ namespaces.add(1, "Talk", "first-letter");
+ namespaces.add(2, "User", "first-letter");
+ namespaces.add(3, "User talk", "first-letter");
+ namespaces.add(4, "Project", "first-letter");
+ namespaces.add(5, "Project talk", "first-letter");
+ namespaces.add(6, "Image", "first-letter");
+ namespaces.add(7, "Image talk", "first-letter");
+ namespaces.add(8, "MediaWiki", "first-letter");
+ namespaces.add(9, "MediaWiki talk", "first-letter");
+ namespaces.add(10, "Template", "first-letter");
+ namespaces.add(11, "Template talk", "first-letter");
+ namespaces.add(12, "Help", "first-letter");
+ namespaces.add(13, "Help talk", "first-letter");
+ namespaces.add(14, "Category", "first-letter");
+ namespaces.add(15, "Category talk", "first-letter");
}
protected void tearDown() throws Exception {
--
To view, visit https://gerrit.wikimedia.org/r/192174
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8715e97da855eb7d9d87c88f95e157e2af1fe784
Gerrit-PatchSet: 12
Gerrit-Project: mediawiki/tools/mwdumper
Gerrit-Branch: master
Gerrit-Owner: Awight <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: Awight <[email protected]>
Gerrit-Reviewer: Brion VIBBER <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Diederik <[email protected]>
Gerrit-Reviewer: Martineznovo <[email protected]>
Gerrit-Reviewer: Oren <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits