Author: ridabenjelloun
Date: Sat Jan 12 16:27:33 2008
New Revision: 611511
URL: http://svn.apache.org/viewvc?rev=611511&view=rev
Log:
- Replace XMLParser by XMLParserUtils
- Create class DcXMLParser, which extends XMLParserUtils and implements Parser.
This class parses Dublin Core metadata.
- Add method setXmlParserNameSpaceContext() to XMLParserUtils.
- Improve OpenOfficeParser to extract document content only from office:body.
- OpenOfficeParser extends XMLParserUtils
- Modify tika-config.xml to use DcXMLParser instead of XMLParser.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParserUtils.java
- copied, changed from r611501,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
incubator/tika/trunk/src/main/resources/tika-config.xml
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=611511&r1=611510&r2=611511&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java
Sat Jan 12 16:27:33 2008
@@ -27,14 +27,14 @@
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+import org.apache.log4j.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.parser.xml.XMLParserUtils;
import org.apache.tika.sax.AppendableAdaptor;
import org.apache.tika.sax.XHTMLContentHandler;
-
-import org.apache.log4j.Logger;
+import org.jaxen.SimpleNamespaceContext;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
@@ -46,7 +46,7 @@
/**
* OpenOffice parser
*/
-public class OpenOfficeParser implements Parser {
+public class OpenOfficeParser extends XMLParserUtils implements Parser {
static Logger logger = Logger.getRootLogger();
private final Namespace NS_DC = Namespace.getNamespace("dc",
@@ -66,7 +66,7 @@
Element rootMeta = xmlMeta.getRootElement();
Element meta = null;
List ls = rootMeta.getChildren();
- if (! ls.isEmpty()) {
+ if (!ls.isEmpty()) {
meta = (Element) ls.get(0);
}
xmlDoc.getRootElement().addContent(meta.detach());
@@ -79,31 +79,46 @@
return xmlDoc;
}
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata) throws IOException, SAXException, TikaException
{
Document xmlDoc = parse(stream);
- XMLParser xp = new XMLParser();
- xp.getAllDocumentNs(xmlDoc);
- xp.extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
- xp.extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
- xp.extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
- xp.extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description",
metadata);
- xp.extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language",
metadata);
- xp.extractContent(xmlDoc, Metadata.KEYWORDS, "//meta:keyword",
metadata);
- xp.extractContent(xmlDoc, Metadata.DATE, "//dc:date", metadata);
- xp.extractContent(xmlDoc, "nbTab",
"//meta:document-statistic/@meta:table-count", metadata);
- xp.extractContent(xmlDoc, "nbObject",
"//meta:document-statistic/@meta:object-count", metadata);
- xp.extractContent(xmlDoc, "nbImg",
"//meta:document-statistic/@meta:image-count", metadata);
- xp.extractContent(xmlDoc, "nbPage",
"//meta:document-statistic/@meta:page-count", metadata);
- xp.extractContent(xmlDoc, "nbPara",
"//meta:document-statistic/@meta:paragraph-count", metadata);
- xp.extractContent(xmlDoc, "nbWord",
"//meta:document-statistic/@meta:word-count", metadata);
- xp.extractContent(xmlDoc, "nbcharacter",
"//meta:document-statistic/@meta:character-count", metadata);
+ // Set NameSpaceContext for OpenDocument
+ SimpleNamespaceContext context = new SimpleNamespaceContext();
+ context.addNamespace("dc", "http://purl.org/dc/elements/1.1/");
+ context.addNamespace("meta",
+ "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
+ context.addNamespace("office",
+ "urn:oasis:names:tc:opendocument:xmlns:office:1.0");
+ setXmlParserNameSpaceContext(context);
+
+ extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
+ extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
+ extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
+ extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description",
+ metadata);
+ extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
+ extractContent(xmlDoc, Metadata.KEYWORDS, "//meta:keyword", metadata);
+ extractContent(xmlDoc, Metadata.DATE, "//dc:date", metadata);
+ extractContent(xmlDoc, "nbTab",
+ "//meta:document-statistic/@meta:table-count", metadata);
+ extractContent(xmlDoc, "nbObject",
+ "//meta:document-statistic/@meta:object-count", metadata);
+ extractContent(xmlDoc, "nbImg",
+ "//meta:document-statistic/@meta:image-count", metadata);
+ extractContent(xmlDoc, "nbPage",
+ "//meta:document-statistic/@meta:page-count", metadata);
+ extractContent(xmlDoc, "nbPara",
+ "//meta:document-statistic/@meta:paragraph-count", metadata);
+ extractContent(xmlDoc, "nbWord",
+ "//meta:document-statistic/@meta:word-count", metadata);
+ extractContent(xmlDoc, "nbcharacter",
+ "//meta:document-statistic/@meta:character-count", metadata);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
- xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
+ concatOccurrence(xmlDoc, "//office:body//*", " ",
+ new AppendableAdaptor(xhtml));
xhtml.endElement("p");
xhtml.endDocument();
}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=611511&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
Sat Jan 12 16:27:33 2008
@@ -0,0 +1,50 @@
+package org.apache.tika.parser.xml;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.AppendableAdaptor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.Utils;
+import org.jaxen.SimpleNamespaceContext;
+import org.jdom.Document;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Dublin core metadata parser
+ */
+public class DcXMLParser extends XMLParserUtils implements Parser {
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata) throws IOException, SAXException, TikaException
{
+ Document xmlDoc = Utils.parse(stream);
+ // Set NameSpaceContext for Dublin Core metadata
+ SimpleNamespaceContext context = new SimpleNamespaceContext();
+ context.addNamespace("dc", "http://purl.org/dc/elements/1.1/");
+ setXmlParserNameSpaceContext(context);
+ extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
+ extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
+ extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
+ extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description",
+ metadata);
+ extractContent(xmlDoc, Metadata.PUBLISHER, "//dc:publisher", metadata);
+ extractContent(xmlDoc, Metadata.CONTRIBUTOR, "//dc:contributor",
+ metadata);
+ extractContent(xmlDoc, Metadata.TYPE, "//dc:type", metadata);
+ extractContent(xmlDoc, Metadata.FORMAT, "//dc:format", metadata);
+ extractContent(xmlDoc, Metadata.IDENTIFIER, "//dc:identifier",
metadata);
+ extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
+ extractContent(xmlDoc, Metadata.RIGHTS, "//dc:rights", metadata);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.startElement("p");
+ concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
+ xhtml.endElement("p");
+ xhtml.endDocument();
+ }
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParserUtils.java
(from r611501,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParserUtils.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParserUtils.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java&r1=611501&r2=611511&rev=611511&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParserUtils.java
Sat Jan 12 16:27:33 2008
@@ -22,15 +22,14 @@
import java.util.Iterator;
import java.util.List;
+import org.apache.commons.lang.StringUtils;
+import org.apache.log4j.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.AppendableAdaptor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.Utils;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.log4j.Logger;
import org.jaxen.JaxenException;
import org.jaxen.SimpleNamespaceContext;
import org.jaxen.jdom.JDOMXPath;
@@ -46,41 +45,22 @@
import org.xml.sax.SAXException;
/**
- * XML parser
+ * XML parser utils
*/
-public class XMLParser implements Parser {
+public class XMLParserUtils {
static Logger logger = Logger.getRootLogger();
+ private SimpleNamespaceContext nsContext;
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
- Document xmlDoc = Utils.parse(stream);
-
- extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
- extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
- extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
- extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description",
metadata);
- extractContent(xmlDoc, Metadata.PUBLISHER, "//dc:publisher", metadata);
- extractContent(xmlDoc, Metadata.CONTRIBUTOR, "//dc:contributor",
metadata);
- extractContent(xmlDoc, Metadata.TYPE, "//dc:type", metadata);
- extractContent(xmlDoc, Metadata.FORMAT, "//dc:format", metadata);
- extractContent(xmlDoc, Metadata.IDENTIFIER, "//dc:identifier",
metadata);
- extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
- extractContent(xmlDoc, Metadata.RIGHTS, "//dc:rights", metadata);
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.startElement("p");
- concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
- xhtml.endElement("p");
- xhtml.endDocument();
- }
-
- public void concatOccurrence(Object xmlDoc, String xpath, String
concatSep, Appendable chaineConcat) throws IOException {
+
+ public void concatOccurrence(Object xmlDoc, String xpath, String concatSep,
+ Appendable chaineConcat) throws IOException {
try {
JDOMXPath xp = new JDOMXPath(xpath);
+ if (nsContext != null) {
+ xp.setNamespaceContext(nsContext);
+ }
List ls = xp.selectNodes(xmlDoc);
Iterator i = ls.iterator();
int j = 0;
@@ -113,9 +93,8 @@
return;
} else {
if (ls.size() != j) {
- chaineConcat.append(' ')
- .append(concatSep)
- .append(' ');
+ chaineConcat.append(' ').append(concatSep).append(
+ ' ');
}
}
}
@@ -165,14 +144,13 @@
}
}
- public void extractContent(
- Document xmlDoc, String name, String xpath, Metadata metadata) {
+ public void extractContent(Document xmlDoc, String name, String xpath,
+ Metadata metadata) {
try {
JDOMXPath xp = new JDOMXPath(xpath);
- SimpleNamespaceContext context = new SimpleNamespaceContext();
- context.addNamespace("dc", "http://purl.org/dc/elements/1.1/");
- context.addNamespace("meta",
"urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
- xp.setNamespaceContext(context);
+ if (nsContext != null) {
+ xp.setNamespaceContext(nsContext);
+ }
List selectNodes = xp.selectNodes(xmlDoc);
Iterator nodes = selectNodes.iterator();
while (nodes.hasNext()) {
@@ -205,4 +183,7 @@
}
+ public void setXmlParserNameSpaceContext(SimpleNamespaceContext nsContext)
{
+ this.nsContext = nsContext;
+ }
}
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=611511&r1=611510&r2=611511&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sat Jan 12 16:27:33
2008
@@ -23,7 +23,7 @@
<parsers>
- <parser name="text-xml" class="org.apache.tika.parser.xml.XMLParser">
+ <parser name="text-xml" class="org.apache.tika.parser.xml.DcXMLParser">
<mime>application/xml</mime>
</parser>