Brion VIBBER has submitted this change and it was merged.

Change subject: Update Xerces to 2.11.0
......................................................................


Update Xerces to 2.11.0

Should fix intermittent UTF-8 input errors (UTF8Reader was throwing
ArrayIndexOutOfBoundsException sometimes when trying to read just
past its buffer boundary).

Includes test case that is known to fail on xerces 2.7.1.

Bug: T59236
Change-Id: Ie0b699804f45d452f9f6b4f3be22e96f9b50d1f7
---
M pom.xml
A tests/org/mediawiki/importer/UTF8BoundsTest.java
2 files changed, 97 insertions(+), 1 deletion(-)

Approvals:
  Brion VIBBER: Verified; Looks good to me, approved



diff --git a/pom.xml b/pom.xml
index 30e77b3..5dc7d5c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -46,7 +46,7 @@
     <dependency>
       <groupId>xerces</groupId>
       <artifactId>xercesImpl</artifactId>
-      <version>2.7.1</version>
+      <version>2.11.0</version>
       <scope>runtime</scope>
     </dependency>
     <dependency>
@@ -55,6 +55,12 @@
       <version>3.8</version>
       <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>2.4</version>
+      <scope>compile</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/tests/org/mediawiki/importer/UTF8BoundsTest.java b/tests/org/mediawiki/importer/UTF8BoundsTest.java
new file mode 100644
index 0000000..4659eb3
--- /dev/null
+++ b/tests/org/mediawiki/importer/UTF8BoundsTest.java
@@ -0,0 +1,90 @@
+/*
+ * MediaWiki import/export processing tools
+ * Copyright 2005-2016 by Brion Vibber and other contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+package org.mediawiki.importer;
+
+import junit.framework.TestCase;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.output.NullOutputStream;
+
+public class UTF8BoundsTest extends TestCase {
+
+       public static void main(String[] args) {
+               junit.textui.TestRunner.run(UTF8BoundsTest.class);
+       }
+
+       private String makeRepeated(int numberOfTimes, String source) {
+               StringBuilder buffer = new StringBuilder();
+               for (int i = 0; i < numberOfTimes; i++) {
+                       buffer.append(source);
+               }
+               return buffer.toString();
+       }
+
+       private String makeSampleRepeated(int repeat, int padding) {
+               // Some scary 4-byte-per-char text from got.wikipedia.org
+               String sample = "𐍅𐌰𐌹𐌻𐌰 𐌰𐌽𐌳𐌰𐌽𐌴𐌼𐌰 𐍃𐌹𐌾𐌰𐌹𐌸 𐌰𐌽𐌰 𐌲𐌿𐍄𐌹𐍃𐌺𐌰𐌼𐌼𐌰 𐌿𐍃𐌼𐌴𐍂𐌾𐌰 𐍅𐌹𐌺𐌹𐍀𐌰𐌹𐌳𐌾𐍉𐍃, 𐍆𐍂𐍉𐌳𐌹𐌱𐍉𐌺𐍉𐍃 𐌹𐌽 𐌽𐌰𐍄𐌾𐌰, 𐌸𐌰𐍂𐌴𐌹 𐍈𐌰𐍂𐌾𐌹𐍃 𐌼𐌰𐌲 𐌼𐌹𐌸𐌰𐍂𐌱𐌰𐌹𐌳𐌾𐌰𐌽. 𐍃𐍉 𐍅𐌹𐌺𐌹𐍀𐌰𐌹𐌳𐌾𐌰 𐌲𐌿𐍄𐍂𐌰𐌶𐌳𐌰𐌹 𐌾𐌰𐌷 447 𐌻𐌰𐌿𐌱𐌰𐌽𐍃 𐌷𐌰𐌱𐌰𐌹𐌸.";
+               return "<mediawiki xml:lang=\"en\">" +
+                       "<siteinfo><namespaces><namespace key=\"0\"></namespace></namespaces></siteinfo>" +
+                       "<page><title>Test</title><revision>" +
+                       "<id>1</id>" +
+                       "<timestamp>2016-04-23T16:46:00Z</timestamp>" +
+                       "<contributor><username>Test</username><id>1</id></contributor>" +
+                       "<text>" + makeRepeated(padding, " ") + makeRepeated(repeat, sample) + "</text>" +
+                       "</revision></page></mediawiki>";
+       }
+
+       private boolean runImportThingy(String sample) throws IOException {
+               OutputStream output = new NullOutputStream();
+               InputStream input = new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
+               DumpWriter sink = new XmlDumpWriter0_10(output);
+               XmlDumpReader source = new XmlDumpReader(input, sink);
+               
+               source.readDump();
+               
+               return true; // did not throw
+       }
+
+       public void testParsingTinyFile() throws IOException {
+               String sample = makeSampleRepeated(0, 0);
+               assertTrue("tiny file parses ok", runImportThingy(sample));
+       }
+
+       public void testParsingManyOffsets() throws IOException {
+               int unicodeRepeatCount = 200;
+               // known to fail in this range on xerces 2.7.1
+               for (int i = 1750; i < 1800; i++) {
+                       String sample = makeSampleRepeated(unicodeRepeatCount, i);
+                       assertTrue("file with repeat of " + i + " parses ok", runImportThingy(sample));
+               }
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/285004
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie0b699804f45d452f9f6b4f3be22e96f9b50d1f7
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/tools/mwdumper
Gerrit-Branch: master
Gerrit-Owner: Brion VIBBER <[email protected]>
Gerrit-Reviewer: Brion VIBBER <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to