Brion VIBBER has submitted this change and it was merged.
Change subject: Update Xerces to 2.11.0
......................................................................
Update Xerces to 2.11.0
Should fix intermittent UTF-8 input errors (UTF8Reader was throwing
ArrayIndexOutOfBoundsException sometimes when trying to read just
past its buffer boundary).
Includes test case that is known to fail on xerces 2.7.1.
Bug: T59236
Change-Id: Ie0b699804f45d452f9f6b4f3be22e96f9b50d1f7
---
M pom.xml
A tests/org/mediawiki/importer/UTF8BoundsTest.java
2 files changed, 97 insertions(+), 1 deletion(-)
Approvals:
Brion VIBBER: Verified; Looks good to me, approved
diff --git a/pom.xml b/pom.xml
index 30e77b3..5dc7d5c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -46,7 +46,7 @@
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
- <version>2.7.1</version>
+ <version>2.11.0</version>
<scope>runtime</scope>
</dependency>
<dependency>
@@ -55,6 +55,12 @@
<version>3.8</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ <scope>compile</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/tests/org/mediawiki/importer/UTF8BoundsTest.java b/tests/org/mediawiki/importer/UTF8BoundsTest.java
new file mode 100644
index 0000000..4659eb3
--- /dev/null
+++ b/tests/org/mediawiki/importer/UTF8BoundsTest.java
@@ -0,0 +1,90 @@
+/*
+ * MediaWiki import/export processing tools
+ * Copyright 2005-2016 by Brion Vibber and other contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+package org.mediawiki.importer;
+
+import junit.framework.TestCase;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.output.NullOutputStream;
+
+public class UTF8BoundsTest extends TestCase {
+
+ public static void main(String[] args) {
+ junit.textui.TestRunner.run(UTF8BoundsTest.class);
+ }
+
+ private String makeRepeated(int numberOfTimes, String source) {
+ StringBuilder buffer = new StringBuilder();
+ for (int i = 0; i < numberOfTimes; i++) {
+ buffer.append(source);
+ }
+ return buffer.toString();
+ }
+
+ private String makeSampleRepeated(int repeat, int padding) {
+ // Some scary 4-byte-per-char text from got.wikipedia.org
+ String sample = "π
π°πΉπ»π° π°π½π³π°π½π΄πΌπ° ππΉπΎπ°πΉπΈ π°π½π° π²πΏππΉππΊπ°πΌπΌπ° πΏππΌπ΄ππΎπ°
π
πΉπΊπΉππ°πΉπ³πΎππ, ππππ³πΉπ±ππΊππ πΉπ½ π½π°ππΎπ°, πΈπ°ππ΄πΉ ππ°ππΎπΉπ πΌπ°π² πΌπΉπΈπ°ππ±π°πΉπ³πΎπ°π½. ππ π
πΉπΊπΉππ°πΉπ³πΎπ°
π²πΏπππ°πΆπ³π°πΉ πΎπ°π· 447 π»π°πΏπ±π°π½π π·π°π±π°πΉπΈ.";
+ return "<mediawiki xml:lang=\"en\">" +
+ "<siteinfo><namespaces><namespace
key=\"0\"></namespace></namespaces></siteinfo>" +
+ "<page><title>Test</title><revision>" +
+ "<id>1</id>" +
+ "<timestamp>2016-04-23T16:46:00Z</timestamp>" +
+
"<contributor><username>Test</username><id>1</id></contributor>" +
+ "<text>" + makeRepeated(padding, " ") +
makeRepeated(repeat, sample) + "</text>" +
+ "</revision></page></mediawiki>";
+ }
+
+ private boolean runImportThingy(String sample) throws IOException {
+ OutputStream output = new NullOutputStream();
+ InputStream input = new
ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
+ DumpWriter sink = new XmlDumpWriter0_10(output);
+ XmlDumpReader source = new XmlDumpReader(input, sink);
+
+ source.readDump();
+
+ return true; // did not throw
+ }
+
+ public void testParsingTinyFile() throws IOException {
+ String sample = makeSampleRepeated(0, 0);
+ assertTrue("tiny file parses ok", runImportThingy(sample));
+ }
+
+ public void testParsingManyOffsets() throws IOException {
+ int unicodeRepeatCount = 200;
+ // known to fail in this range on xerces 2.7.1
+ for (int i = 1750; i < 1800; i++) {
+ String sample = makeSampleRepeated(unicodeRepeatCount,
i);
+ assertTrue("file with repeat of " + i + " parses ok",
runImportThingy(sample));
+ }
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/285004
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie0b699804f45d452f9f6b4f3be22e96f9b50d1f7
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/tools/mwdumper
Gerrit-Branch: master
Gerrit-Owner: Brion VIBBER <[email protected]>
Gerrit-Reviewer: Brion VIBBER <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits