Author: jukka
Date: Wed Jan 7 07:29:31 2009
New Revision: 732361
URL: http://svn.apache.org/viewvc?rev=732361&view=rev
Log:
TIKA-180: XHTMLContentHandler unable to extract text from MSWord file
Added a SafeContentHandler class that prevents invalid XML characters from
being written out.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
lucene/tika/trunk/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=732361&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
(added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
Wed Jan 7 07:29:31 2009
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Content handler decorator that prevents invalid XML characters from
+ * being passed on to the decorated handler. Text received through
+ * {@link #characters(char[], int, int)} and
+ * {@link #ignorableWhitespace(char[], int, int)} is scanned one UTF-16
+ * code unit at a time; every code unit rejected by
+ * {@link #isInvalid(char)} is substituted using
+ * {@link #writeReplacement(Output)} (a single space by default) and all
+ * other content is forwarded unchanged, through the same kind of event
+ * it arrived as.
+ */
+public class SafeContentHandler extends ContentHandlerDecorator {
+
+    /**
+     * Default replacement for an invalid character: a single space.
+     */
+    private static final char[] REPLACEMENT = new char[] { ' ' };
+
+    /**
+     * Internal interface that allows both character and
+     * ignorable whitespace content to be filtered the same way.
+     */
+    protected interface Output {
+        void write(char[] ch, int start, int length) throws SAXException;
+    }
+
+    /**
+     * Output through the
+     * {@link ContentHandler#characters(char[], int, int)}
+     * method of the decorated content handler.
+     */
+    private final Output charactersOutput = new Output() {
+        public void write(char[] ch, int start, int length)
+                throws SAXException {
+            SafeContentHandler.super.characters(ch, start, length);
+        }
+    };
+
+    /**
+     * Output through the
+     * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
+     * method of the decorated content handler.
+     */
+    private final Output ignorableWhitespaceOutput = new Output() {
+        public void write(char[] ch, int start, int length)
+                throws SAXException {
+            // Must stay an ignorableWhitespace event; routing it through
+            // characters() would change the event type seen downstream.
+            SafeContentHandler.super.ignorableWhitespace(ch, start, length);
+        }
+    };
+
+    /**
+     * Creates a filtering decorator for the given content handler.
+     *
+     * @param handler the content handler to be decorated
+     */
+    public SafeContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Copies the given range to the output, substituting the replacement
+     * for every code unit that {@link #isInvalid(char)} rejects. Runs of
+     * valid code units are written with a single call each.
+     *
+     * @param ch character array
+     * @param start index of the first code unit to process
+     * @param length number of code units to process
+     * @param output where the filtered content is written to
+     * @throws SAXException if the output could not be written
+     */
+    private void filter(char[] ch, int start, int length, Output output)
+            throws SAXException {
+        int end = start + length;
+        // Index of the first code unit that has not been written yet
+        int pending = start;
+
+        for (int i = start; i < end; i++) {
+            if (isInvalid(ch[i])) {
+                // Output any preceding valid characters
+                if (i > pending) {
+                    output.write(ch, pending, i - pending);
+                }
+
+                // Output the replacement for this invalid character
+                writeReplacement(output);
+
+                // Continue with the rest of the array
+                pending = i + 1;
+            }
+        }
+
+        // Output any remaining valid characters. This call is made even
+        // when nothing remains, so an empty input still produces one
+        // (zero-length) event on the decorated handler.
+        output.write(ch, pending, end - pending);
+    }
+
+    /**
+     * Checks whether the given character (more accurately a UTF-16 code unit)
+     * is an invalid XML character and should be replaced for output.
+     * The default implementation rejects the control characters below
+     * U+0020 other than tab, line feed and carriage return, and the
+     * noncharacters U+FFFE and U+FFFF.
+     * Subclasses can override this method to use an alternative definition
+     * of which characters should be replaced in the XML output.
+     *
+     * @param ch character
+     * @return <code>true</code> if the character should be replaced,
+     *         <code>false</code> otherwise
+     */
+    protected boolean isInvalid(char ch) {
+        // TODO: Detect also unpaired surrogates. That cannot be decided
+        // from a single code unit, because a valid surrogate pair is made
+        // of two code units that would each look invalid on their own.
+        if (ch < 0x20) {
+            return ch != 0x09 && ch != 0x0A && ch != 0x0D;
+        }
+        // Only U+FFFE and U+FFFF lie above U+FFFD in the char range
+        return ch > 0xFFFD;
+    }
+
+    /**
+     * Outputs the replacement for an invalid character. Subclasses can
+     * override this method to use a custom replacement.
+     *
+     * @param output where the replacement is written to
+     * @throws SAXException if the replacement could not be written
+     */
+    protected void writeReplacement(Output output) throws SAXException {
+        output.write(REPLACEMENT, 0, REPLACEMENT.length);
+    }
+
+    //------------------------------------------------------< ContentHandler >
+
+    /**
+     * Filters the given character content and forwards the result as
+     * character events.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        filter(ch, start, length, charactersOutput);
+    }
+
+    /**
+     * Filters the given ignorable whitespace and forwards the result as
+     * ignorable whitespace events.
+     */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        filter(ch, start, length, ignorableWhitespaceOutput);
+    }
+
+}
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java?rev=732361&view=auto
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
(added)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
Wed Jan 7 07:29:31 2009
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {...@link SafeContentHandler} class.
+ */
+public class SafeContentHandlerTest extends TestCase {
+
+ private ContentHandler output;
+
+ private ContentHandler safe;
+
+ protected void setUp() {
+ output = new WriteOutContentHandler();
+ safe = new SafeContentHandler(output);
+ }
+
+ public void testEmptyInput() throws SAXException {
+ safe.characters(new char[0], 0, 0);
+ safe.ignorableWhitespace(new char[0], 0, 0);
+ assertEquals("", output.toString());
+ }
+
+ public void testNormalCharacters() throws SAXException {
+ safe.characters("abc".toCharArray(), 0, 3);
+ assertEquals("abc", output.toString());
+ }
+
+ public void testNormalWhitespace() throws SAXException {
+ safe.ignorableWhitespace("abc".toCharArray(), 0, 3);
+ assertEquals("abc", output.toString());
+ }
+
+ public void testInvalidCharacters() throws SAXException {
+ safe.characters("ab\u0007".toCharArray(), 0, 3);
+ safe.characters("a\u000Bc".toCharArray(), 0, 3);
+ safe.characters("\u0019bc".toCharArray(), 0, 3);
+ assertEquals("ab a c bc", output.toString());
+ }
+
+ public void testInvalidWhitespace() throws SAXException {
+ safe.ignorableWhitespace("ab\u0000".toCharArray(), 0, 3);
+ safe.ignorableWhitespace("a\u0001c".toCharArray(), 0, 3);
+ safe.ignorableWhitespace("\u0002bc".toCharArray(), 0, 3);
+ assertEquals("ab a c bc", output.toString());
+ }
+
+}