Author: jukka
Date: Fri Oct 12 00:46:08 2007
New Revision: 584092
URL: http://svn.apache.org/viewvc?rev=584092&view=rev
Log:
TIKA-53 - XHTML SAX events from parsers
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ContentHandlerDecorator.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/TeeContentHandler.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/WriteOutContentHandler.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/XHTMLContentHandler.java
(with props)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Oct 12 00:46:08 2007
@@ -91,3 +91,4 @@
41. TIKA-52 - RereadableInputStream needs to support not closing the input
stream it wraps.
(K. Bennett via bdelacretaz)
+42. TIKA-53 - XHTML SAX events from parsers (jukka)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java Fri Oct
12 00:46:08 2007
@@ -25,7 +25,10 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.XHTMLContentHandler;
import org.apache.tika.utils.RereadableInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Defines a Microsoft document content extractor.
@@ -37,10 +40,11 @@
/**
* Extracts properties and text from an MS Document input stream
*/
- public String parse(InputStream input, Metadata metadata)
- throws IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
RereadableInputStream ris =
- new RereadableInputStream(input, MEMORY_THRESHOLD, true, false);
+ new RereadableInputStream(stream, MEMORY_THRESHOLD, true, false);
try {
// First, extract properties
POIFSReader reader = new POIFSReader();
@@ -48,14 +52,18 @@
new PropertiesReaderListener(metadata),
SummaryInformation.DEFAULT_STREAM_NAME);
- if (input.available() > 0) {
+ if (stream.available() > 0) {
reader.read(ris);
}
while (ris.read() != -1) {
}
ris.rewind();
// Extract document full text
- return extractText(ris);
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.element("p", extractText(ris));
+ xhtml.endDocument();
} catch (IOException e) {
throw e;
} catch (TikaException e) {
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ContentHandlerDecorator.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ContentHandlerDecorator.java?rev=584092&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ContentHandlerDecorator.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ContentHandlerDecorator.java
Fri Oct 12 00:46:08 2007
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+/**
+ * Decorator base class for the {@link ContentHandler} interface. This class
+ * simply delegates all SAX events calls to an underlying decorated handler
+ * instance. Subclasses can provide extra decoration by overriding one or more
+ * of the SAX event methods.
+ */
+public class ContentHandlerDecorator implements ContentHandler {
+
+ /**
+ * Decorated SAX event handler.
+ */
+ private final ContentHandler handler;
+
+ /**
+ * Creates a decorator for the given SAX event handler.
+ *
+ * @param handler SAX event handler to be decorated
+ */
+ public ContentHandlerDecorator(ContentHandler handler) {
+ this.handler = handler;
+ }
+
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ handler.startPrefixMapping(prefix, uri);
+ }
+
+ public void endPrefixMapping(String prefix) throws SAXException {
+ handler.endPrefixMapping(prefix);
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ handler.processingInstruction(target, data);
+ }
+
+ public void setDocumentLocator(Locator locator) {
+ handler.setDocumentLocator(locator);
+ }
+
+ public void startDocument() throws SAXException {
+ handler.startDocument();
+ }
+
+ public void endDocument() throws SAXException {
+ handler.endDocument();
+ }
+
+ public void startElement(String uri, String localName, String name,
+ Attributes atts) throws SAXException {
+ handler.startElement(uri, localName, name, atts);
+ }
+
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ handler.endElement(uri, localName, name);
+ }
+
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ handler.characters(ch, start, length);
+ }
+
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ handler.ignorableWhitespace(ch, start, length);
+ }
+
+ public void skippedEntity(String name) throws SAXException {
+ handler.skippedEntity(name);
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ContentHandlerDecorator.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Fri
Oct 12 00:46:08 2007
@@ -21,6 +21,8 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Tika parser interface
@@ -28,22 +30,20 @@
public interface Parser {
/**
- * Parses a document from the given input stream and returns the
- * extracted full text content of the document. Fills in selected
- * metadata information in the given set of {@link Content} instances.
+ * Parses a document stream into a sequence of XHTML SAX events.
+ * Fills in related document metadata in the given metadata object.
* <p>
- * The given stream is consumed but not closed by this method.
+ * The given document stream is consumed but not closed by this method.
* The responsibility to close the stream remains on the caller.
*
- * @param stream the document to be parsed
- * @param contents configuration of metadata information to extract
- * @param metadata document metadata
- * @return full text content of the document
- * @throws IOException if the document could not be read
+ * @param stream the document stream (input)
+ * @param handler handler for the XHTML SAX events (output)
+ * @param metadata document metadata (input and output)
+ * @throws IOException if the document stream could not be read
+ * @throws SAXException if the SAX events could not be processed
* @throws TikaException if the document could not be parsed
*/
- String parse(
- InputStream stream, Metadata metadata)
- throws IOException, TikaException;
+ void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException;
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
Fri Oct 12 00:46:08 2007
@@ -21,6 +21,8 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Decorator base class for the {@link Parser} interface. This class
@@ -49,9 +51,10 @@
* override this method (and use <code>super.parse()</code> to invoke
* the decorated parser) to implement extra decoration.
*/
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- return parser.parse(stream, metadata);
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parser.parse(stream, handler, metadata);
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
Fri Oct 12 00:46:08 2007
@@ -18,11 +18,14 @@
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringWriter;
-import org.apache.log4j.Logger;
+import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.RegexUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Parser decorator that post-processes the results from a decorated parser.
@@ -33,12 +36,6 @@
*/
public class ParserPostProcessor extends ParserDecorator {
- /**
- * Logger instance.
- */
- private static final Logger logger =
- Logger.getLogger(ParserPostProcessor.class);
-
private static final String LINK_PATTERN =
"([A-Za-z][A-Za-z0-9+.-]{1,120}:"
+ "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
@@ -57,25 +54,26 @@
* Forwards the call to the delegated parser and post-processes the
* results as described above.
*/
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- try {
- String contentStr = super.parse(stream, metadata);
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ StringWriter writer = new StringWriter();
+ handler = new TeeContentHandler(
+ handler, new WriteOutContentHandler(writer));
+ super.parse(stream, handler, metadata);
- metadata.set("fulltext", contentStr);
+ String content = writer.toString();
+ metadata.set("fulltext", content);
- int length = Math.min(contentStr.length(), 500);
- String summary = contentStr.substring(0, length);
- metadata.set("summary", summary);
+ int length = Math.min(content.length(), 500);
+ metadata.set("summary", content.substring(0, length));
- for (String link : RegexUtils.extract(contentStr, LINK_PATTERN)) {
+ try {
+ for (String link : RegexUtils.extract(content, LINK_PATTERN)) {
metadata.add("outlinks", link);
}
-
- return contentStr;
- } catch (Exception e) {
- logger.error("Parse error: " + e.getMessage(), e);
- return "";
+ } catch (MalformedPatternException e) {
+ throw new TikaException("Malformed URL pattern", e);
}
}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/TeeContentHandler.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/TeeContentHandler.java?rev=584092&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/TeeContentHandler.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/TeeContentHandler.java
Fri Oct 12 00:46:08 2007
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+
+/**
+ * Content handler decorator that forwards the received SAX events to two
+ * underlying content handlers.
+ */
+public class TeeContentHandler extends ContentHandlerDecorator {
+
+ private final ContentHandler branch;
+
+ public TeeContentHandler(ContentHandler handler, ContentHandler branch) {
+ super(handler);
+ this.branch = branch;
+ }
+
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ super.startPrefixMapping(prefix, uri);
+ branch.startPrefixMapping(prefix, uri);
+ }
+
+ public void endPrefixMapping(String prefix) throws SAXException {
+ super.endPrefixMapping(prefix);
+ branch.endPrefixMapping(prefix);
+ }
+
+ public void processingInstruction(String target, String data)
+ throws SAXException {
+ super.processingInstruction(target, data);
+ branch.processingInstruction(target, data);
+ }
+
+ public void setDocumentLocator(Locator locator) {
+ super.setDocumentLocator(locator);
+ branch.setDocumentLocator(locator);
+ }
+
+ public void startDocument() throws SAXException {
+ super.startDocument();
+ branch.startDocument();
+ }
+
+ public void endDocument() throws SAXException {
+ super.endDocument();
+ branch.endDocument();
+ }
+
+ public void startElement(String uri, String localName, String name,
+ Attributes atts) throws SAXException {
+ super.startElement(uri, localName, name, atts);
+ branch.startElement(uri, localName, name, atts);
+ }
+
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ super.endElement(uri, localName, name);
+ branch.endElement(uri, localName, name);
+ }
+
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ super.characters(ch, start, length);
+ branch.characters(ch, start, length);
+ }
+
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ super.ignorableWhitespace(ch, start, length);
+ branch.ignorableWhitespace(ch, start, length);
+ }
+
+ public void skippedEntity(String name) throws SAXException {
+ super.skippedEntity(name);
+ branch.skippedEntity(name);
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/TeeContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/WriteOutContentHandler.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/WriteOutContentHandler.java?rev=584092&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/WriteOutContentHandler.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/WriteOutContentHandler.java
Fri Oct 12 00:46:08 2007
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX event handler that writes all character content out to
+ * a {@link Writer} character stream.
+ */
+public class WriteOutContentHandler extends DefaultHandler {
+
+ /**
+ * The character stream.
+ */
+ private final Writer writer;
+
+ public WriteOutContentHandler(Writer writer) {
+ this.writer = writer;
+ }
+
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ try {
+ writer.write(ch, start, length);
+ } catch (IOException e) {
+ throw new SAXException("Error writing out character content", e);
+ }
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/WriteOutContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/XHTMLContentHandler.java?rev=584092&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/XHTMLContentHandler.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/XHTMLContentHandler.java
Fri Oct 12 00:46:08 2007
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that simplifies the task of producing XHTML
+ * events for Tika content parsers.
+ */
+public class XHTMLContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * The XHTML namespace URI
+ */
+ public static final String XHTML = "http://www.w3.org/1999/xhtml";
+
+ /**
+ * Metadata associated with the document. Used to fill in the
+ * <head/> section.
+ */
+ private final Metadata metadata;
+
+ public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
+ super(handler);
+ this.metadata = metadata;
+ }
+
+ /**
+ * Starts an XHTML document by setting up the namespace mappings and
+ * writing following header:
+ * <pre>
+ * <html>
+ * <head>
+ * <title>...</title>
+ * </head>
+ * <body>
+ * </pre>
+ */
+ public void startDocument() throws SAXException {
+ super.startDocument();
+ startPrefixMapping("", XHTML);
+ startElement("html");
+ startElement("head");
+ startElement("title");
+ String title = metadata.get(Metadata.TITLE);
+ if (title != null && title.length() > 0) {
+ characters(title);
+ }
+ endElement("title");
+ endElement("head");
+ startElement("body");
+ }
+
+ /**
+ * Ends the XHTML document by writing the following footer and
+ * clearing the namespace mappings:
+ * <pre>
+ * </body>
+ * </html>
+ * </pre>
+ */
+ public void endDocument() throws SAXException {
+ endElement("body");
+ endElement("html");
+ endPrefixMapping("");
+ super.endDocument();
+ }
+
+ public void startElement(String name) throws SAXException {
+ startElement(XHTML, name, name, new AttributesImpl());
+ }
+
+ public void endElement(String name) throws SAXException {
+ endElement(XHTML, name, name);
+ }
+
+ public void characters(String characters) throws SAXException {
+ characters(characters.toCharArray(), 0, characters.length());
+ }
+
+ public void element(String name, String value) throws SAXException {
+ startElement(name);
+ characters(value);
+ endElement(name);
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/XHTMLContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Fri Oct 12 00:46:08 2007
@@ -19,32 +19,47 @@
import java.io.IOException;
import java.io.InputStream;
-import org.apache.log4j.Logger;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.sax.SAXResult;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.w3c.dom.Element;
-import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Html parser
- *
*/
public class HtmlParser implements Parser {
- static Logger logger = Logger.getRootLogger();
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ try {
+ Tidy tidy = new Tidy();
+ tidy.setQuiet(true);
+ tidy.setShowWarnings(false);
+ tidy.setXHTML(true);
+
+ Element root = tidy.parseDOM(stream, null).getDocumentElement();
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- Tidy tidy = new Tidy();
- tidy.setQuiet(true);
- tidy.setShowWarnings(false);
- Node root = tidy.parseDOM(stream, null).getDocumentElement();
- extractElementTxt((Element) root, Metadata.TITLE, "title", metadata);
- return getTextContent(root);
+ metadata.set(Metadata.CONTENT_TYPE, "text/html");
+ extractElementTxt(root, Metadata.TITLE, "title", metadata);
+
+ TransformerFactory factory = TransformerFactory.newInstance();
+ Transformer transformer = factory.newTransformer();
+ transformer.transform(new DOMSource(root), new SAXResult(handler));
+ } catch (TransformerException e) {
+ throw new TikaException("Failed to transform DOM to SAX", e);
+ }
}
private void extractElementTxt(
@@ -69,24 +84,6 @@
}
}
}
- }
-
- private String getTextContent(Node node) {
- NodeList children = node.getChildNodes();
- StringBuffer sb = new StringBuffer();
- for (int i = 0; i < children.getLength(); i++) {
- Node child = children.item(i);
- switch (child.getNodeType()) {
- case Node.ELEMENT_NODE:
- sb.append(getTextContent(child));
- sb.append(" ");
- break;
- case Node.TEXT_NODE:
- sb.append(((Text) child).getData());
- break;
- }
- }
- return sb.toString();
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
Fri Oct 12 00:46:08 2007
@@ -30,6 +30,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.XHTMLContentHandler;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.log4j.Logger;
@@ -38,6 +39,8 @@
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* OpenOffice parser
@@ -75,8 +78,9 @@
return xmlDoc;
}
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
Document xmlDoc = parse(stream);
XMLParser xp = new XMLParser();
xp.getAllDocumentNs(xmlDoc);
@@ -94,7 +98,11 @@
xp.extractContent(xmlDoc, "nbPara",
"//meta:document-statistic/@meta:paragraph-count", metadata);
xp.extractContent(xmlDoc, "nbWord",
"//meta:document-statistic/@meta:word-count", metadata);
xp.extractContent(xmlDoc, "nbcharacter",
"//meta:document-statistic/@meta:character-count", metadata);
- return xp.concatOccurrence(xmlDoc, "//*", " ");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.element("p", xp.concatOccurrence(xmlDoc, "//*", " "));
+ xhtml.endDocument();
}
public List unzip(InputStream is) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
Fri Oct 12 00:46:08 2007
@@ -24,18 +24,22 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.XHTMLContentHandler;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* PDF parser
*/
public class PDFParser implements Parser {
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
try {
PDDocument pdfDocument = PDDocument.load(stream);
try {
@@ -81,7 +85,12 @@
StringWriter writer = new StringWriter();
new PDFTextStripper().writeText(pdfDocument, writer);
- return writer.getBuffer().toString();
+
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.element("p", writer.getBuffer().toString());
+ xhtml.endDocument();
} finally {
pdfDocument.close();
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
Fri Oct 12 00:46:08 2007
@@ -19,27 +19,35 @@
import java.io.IOException;
import java.io.InputStream;
+import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* RTF parser
*/
public class RTFParser implements Parser {
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
try {
DefaultStyledDocument sd = new DefaultStyledDocument();
new RTFEditorKit().read(stream, sd, 0);
- return sd.getText(0, sd.getLength());
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
+
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.element("p", sd.getText(0, sd.getLength()));
+ xhtml.endDocument();
+ } catch (BadLocationException e) {
throw new TikaException("Error parsing an RTF document", e);
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Fri Oct 12 00:46:08 2007
@@ -19,10 +19,14 @@
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.Reader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@@ -32,8 +36,9 @@
*/
public class TXTParser implements Parser {
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
CharsetDetector detector = new CharsetDetector();
// Use the declared character encoding, if available
@@ -62,7 +67,16 @@
metadata.set(Metadata.LANGUAGE, match.getLanguage());
}
- return match.getString();
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.startElement("p");
+ Reader reader = match.getReader();
+ char[] buffer = new char[4096];
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ xhtml.characters(buffer, 0, n);
+ }
+ xhtml.endElement("p");
+ xhtml.endDocument();
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Fri Oct 12 00:46:08 2007
@@ -25,6 +25,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.XHTMLContentHandler;
import org.apache.tika.utils.Utils;
import org.apache.commons.lang.StringUtils;
@@ -40,6 +41,8 @@
import org.jdom.Namespace;
import org.jdom.ProcessingInstruction;
import org.jdom.Text;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* XML parser
@@ -48,9 +51,11 @@
static Logger logger = Logger.getRootLogger();
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
Document xmlDoc = Utils.parse(stream);
+
extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
@@ -62,7 +67,11 @@
extractContent(xmlDoc, Metadata.IDENTIFIER, "//dc:identifier",
metadata);
extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
extractContent(xmlDoc, Metadata.RIGHTS, "//dc:rights", metadata);
- return concatOccurrence(xmlDoc, "//*", " ");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.element("p", concatOccurrence(xmlDoc, "//*", " "));
+ xhtml.endDocument();
}
public String concatOccurrence(Object xmlDoc, String xpath, String concatSep) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
Fri Oct 12 00:46:08 2007
@@ -23,6 +23,7 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
@@ -34,6 +35,8 @@
import org.apache.tika.metadata.TikaMimeKeys;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserFactory;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.SAXException;
/**
* Contains utility methods for parsing documents. Intended to provide simple
@@ -159,21 +162,24 @@
/**
* Gets the string content of a document read from an input stream.
*
- * @param inputStream
- * the stream from which to read document data
+ * @param stream the stream from which to read document data
* @param config
- * @param mimeType
- * MIME type of the data
+ * @param mimeType MIME type of the data
* @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
*/
- public static String getStringContent(InputStream inputStream,
- TikaConfig config, String mimeType) throws TikaException,
- IOException {
- ParserConfig pc = config.getParserConfig(mimeType);
- Parser parser = ParserFactory.getParser(pc);
- return parser.parse(inputStream, new Metadata());
+ public static String getStringContent(
+ InputStream stream, TikaConfig config, String mimeType)
+ throws TikaException, IOException {
+ try {
+ ParserConfig pc = config.getParserConfig(mimeType);
+ Parser parser = ParserFactory.getParser(pc);
+ StringWriter writer = new StringWriter();
+ parser.parse(
+ stream, new WriteOutContentHandler(writer), new Metadata());
+ return writer.toString();
+ } catch (SAXException e) {
+ throw new TikaException("Unexpected SAX error", e);
+ }
}
/**
@@ -183,8 +189,6 @@
* URL pointing to the document to parse
* @param config
* @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
*/
public static String getStringContent(URL documentUrl, TikaConfig config)
throws TikaException, IOException {
@@ -202,11 +206,10 @@
* @param mimeType
* MIME type of the data
* @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
*/
- public static String getStringContent(URL documentUrl, TikaConfig config,
- String mimeType) throws TikaException, IOException {
+ public static String getStringContent(
+ URL documentUrl, TikaConfig config, String mimeType)
+ throws TikaException, IOException {
InputStream stream = documentUrl.openStream();
try {
return getStringContent(stream, config, mimeType);
@@ -224,11 +227,10 @@
* @param mimeType
* MIME type of the data
* @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
*/
- public static String getStringContent(File documentFile, TikaConfig config,
- String mimeType) throws TikaException, IOException {
+ public static String getStringContent(
+ File documentFile, TikaConfig config, String mimeType)
+ throws TikaException, IOException {
InputStream stream = new BufferedInputStream(new FileInputStream(
documentFile));
try {
@@ -245,13 +247,11 @@
* File object pointing to the document to parse
* @param config
* @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
*/
public static String getStringContent(File documentFile, TikaConfig config)
throws TikaException, IOException {
- String mime = config.getMimeRepository().getMimeType(documentFile)
- .getName();
+ String mime =
+ config.getMimeRepository().getMimeType(documentFile).getName();
return getStringContent(documentFile, config, mime);
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Fri Oct 12 00:46:08 2007
@@ -32,6 +32,7 @@
import org.apache.tika.utils.ParseUtils;
import org.apache.tika.utils.Utils;
import org.jdom.JDOMException;
+import org.xml.sax.helpers.DefaultHandler;
/**
* Junit test class for Tika {@link Parser}s.
@@ -113,7 +114,7 @@
Metadata metadata = new Metadata();
InputStream stream = new FileInputStream(file);
try {
- parser.parse(stream, metadata);
+ parser.parse(stream, new DefaultHandler(), metadata);
} finally {
stream.close();
}
@@ -130,7 +131,7 @@
Metadata metadata = new Metadata();
InputStream stream = new FileInputStream(file);
try {
- parser.parse(stream, metadata);
+ parser.parse(stream, new DefaultHandler(), metadata);
} finally {
stream.close();
}
@@ -155,7 +156,7 @@
Metadata metadata = new Metadata();
InputStream stream = new FileInputStream(file);
try {
- parser.parse(stream, metadata);
+ parser.parse(stream, new DefaultHandler(), metadata);
} finally {
stream.close();
}
@@ -183,7 +184,7 @@
Metadata metadata = new Metadata();
InputStream stream = new FileInputStream(file);
try {
- parser.parse(stream, metadata);
+ parser.parse(stream, new DefaultHandler(), metadata);
} finally {
stream.close();
}
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=584092&r1=584091&r2=584092&view=diff
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
(original)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Fri Oct 12 00:46:08 2007
@@ -17,9 +17,11 @@
package org.apache.tika.parser.txt;
import java.io.ByteArrayInputStream;
+import java.io.StringWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.WriteOutContentHandler;
import junit.framework.TestCase;
@@ -34,8 +36,12 @@
+ " encoding and the language of the input stream.";
Metadata metadata = new Metadata();
- String content = parser.parse(
- new ByteArrayInputStream(text.getBytes("UTF-8")), metadata);
+ StringWriter writer = new StringWriter();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new WriteOutContentHandler(writer),
+ metadata);
+ String content = writer.toString();
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
@@ -54,8 +60,12 @@
String text =
"I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
Metadata metadata = new Metadata();
- String content = parser.parse(
- new ByteArrayInputStream(text.getBytes("UTF-8")), metadata);
+ StringWriter writer = new StringWriter();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new WriteOutContentHandler(writer),
+ metadata);
+ String content = writer.toString();
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
@@ -65,8 +75,12 @@
public void testEmptyText() throws Exception {
Metadata metadata = new Metadata();
- String content = parser.parse(
- new ByteArrayInputStream(new byte[0]), metadata);
+ StringWriter writer = new StringWriter();
+ parser.parse(
+ new ByteArrayInputStream(new byte[0]),
+ new WriteOutContentHandler(writer),
+ metadata);
+ String content = writer.toString();
assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("", content);
}