Author: jukka
Date: Wed Mar 30 16:52:51 2011
New Revision: 1087014
URL: http://svn.apache.org/viewvc?rev=1087014&view=rev
Log:
TIKA-625: Easier XML parser extensibility
Add the proposed Attribute- and ElementMetadataHandler classes
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
(contents, props changed)
- copied, changed from r1085453,
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
(with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
(contents, props changed)
- copied, changed from r1085453,
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
Copied:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
(from r1085453,
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java)
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java&r1=1085453&r2=1087014&rev=1087014&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
Wed Mar 30 16:52:51 2011
@@ -17,30 +17,33 @@
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
-import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
- * This adds Metadata entries with a specified name for
- * the textual content of a node (if present), and
- * all attribute values passed through the matcher
- * (but not their names).
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 1.0
*/
-public class MetadataHandler extends DefaultHandler {
+class AbstractMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final String name;
- private final StringBuilder buffer = new StringBuilder();
-
- public MetadataHandler(Metadata metadata, String name) {
+ protected AbstractMetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.name = name;
}
- public void addMetadata(String value) {
- if (value.length() > 0) {
+ /**
+ * Adds the given metadata value. The value is ignored if it is
+ * <code>null</code> or empty. If the metadata entry already exists,
+ * then the given value is appended to it with a comma as the separator.
+ *
+ * @param value metadata value
+ */
+ protected void addMetadata(String value) {
+ if (value != null && value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
@@ -49,21 +52,4 @@ public class MetadataHandler extends Def
}
}
- public void endElement(String uri, String localName, String name) {
- addMetadata(buffer.toString());
- buffer.setLength(0);
- }
-
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
- for (int i = 0; i < attributes.getLength(); i++) {
- addMetadata(attributes.getValue(i));
- }
- }
-
-
- public void characters(char[] ch, int start, int length) {
- buffer.append(ch, start, length);
- }
-
}
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1087014&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
Wed Mar 30 16:52:51 2011
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that maps the contents of an XML attribute into
+ * a metadata field.
+ *
+ * @since Apache Tika 1.0
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+ private final String uri;
+
+ private final String localName;
+
+ protected AttributeMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ for (int i = 0; i < attributes.getLength(); i++) {
+ if (attributes.getURI(i).equals(this.uri)
+ && attributes.getLocalName(i).equals(this.localName)) {
+ addMetadata(attributes.getValue(i).trim());
+ }
+ }
+ }
+
+}
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
------------------------------------------------------------------------------
svn:executable = *
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=1087014&r1=1087013&r2=1087014&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
Wed Mar 30 16:52:51 2011
@@ -19,10 +19,6 @@ package org.apache.tika.parser.xml;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.xpath.CompositeMatcher;
-import org.apache.tika.sax.xpath.Matcher;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
/**
@@ -30,39 +26,29 @@ import org.xml.sax.ContentHandler;
*/
public class DcXMLParser extends XMLParser {
- private static final XPathParser DC_XPATH = new XPathParser(
- "dc", "http://purl.org/dc/elements/1.1/");
-
- private static ContentHandler getDublinCore(
- ContentHandler ch, Metadata md, String name, String element) {
- Matcher matcher = new CompositeMatcher(
- DC_XPATH.parse("//dc:" + element),
- DC_XPATH.parse("//dc:" + element + "//text()"));
- ContentHandler branch =
- new MatchingContentHandler(new MetadataHandler(md, name), matcher);
- return new TeeContentHandler(ch, branch);
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, String name, String element) {
+ return new ElementMetadataHandler(
+ "http://purl.org/dc/elements/1.1/", element,
+ metadata, name);
}
- protected ContentHandler getContentHandler(ContentHandler ch, Metadata md) {
- ch = super.getContentHandler(ch, md);
- ch = getDublinCore(ch, md, DublinCore.TITLE, "title");
- ch = getDublinCore(ch, md, DublinCore.SUBJECT, "subject");
- ch = getDublinCore(ch, md, DublinCore.CREATOR, "creator");
- ch = getDublinCore(ch, md, DublinCore.DESCRIPTION, "description");
- ch = getDublinCore(ch, md, DublinCore.PUBLISHER, "publisher");
- ch = getDublinCore(ch, md, DublinCore.CONTRIBUTOR, "contributor");
- try {
- ch = getDublinCore(ch, md, DublinCore.DATE.getName(), "date");
- } catch (Exception e) {
- // Date format and parsing behavior was undefined and untested when DublinCare
- // date was converted to Property.internalDate so we silently skip date on parse error
- }
- ch = getDublinCore(ch, md, DublinCore.TYPE, "type");
- ch = getDublinCore(ch, md, DublinCore.FORMAT, "format");
- ch = getDublinCore(ch, md, DublinCore.IDENTIFIER, "identifier");
- ch = getDublinCore(ch, md, DublinCore.LANGUAGE, "language");
- ch = getDublinCore(ch, md, DublinCore.RIGHTS, "rights");
- return ch;
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata),
+ getDublinCoreHandler(metadata, DublinCore.TITLE, "title"),
+ getDublinCoreHandler(metadata, DublinCore.SUBJECT, "subject"),
+ getDublinCoreHandler(metadata, DublinCore.CREATOR, "creator"),
+ getDublinCoreHandler(metadata, DublinCore.DESCRIPTION, "description"),
+ getDublinCoreHandler(metadata, DublinCore.PUBLISHER, "publisher"),
+ getDublinCoreHandler(metadata, DublinCore.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(metadata, DublinCore.DATE.getName(), "date"),
+ getDublinCoreHandler(metadata, DublinCore.TYPE, "type"),
+ getDublinCoreHandler(metadata, DublinCore.FORMAT, "format"),
+ getDublinCoreHandler(metadata, DublinCore.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(metadata, DublinCore.LANGUAGE, "language"),
+ getDublinCoreHandler(metadata, DublinCore.RIGHTS, "rights"));
}
}
Copied:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
(from r1085453,
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java)
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java&r1=1085453&r2=1087014&rev=1087014&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
Wed Mar 30 16:52:51 2011
@@ -18,52 +18,63 @@ package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
-import org.xml.sax.helpers.DefaultHandler;
/**
- * This adds Metadata entries with a specified name for
- * the textual content of a node (if present), and
- * all attribute values passed through the matcher
- * (but not their names).
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ *
+ * @since Apache Tika 1.0
*/
-public class MetadataHandler extends DefaultHandler {
+public class ElementMetadataHandler extends AbstractMetadataHandler {
- private final Metadata metadata;
+ private final String uri;
- private final String name;
+ private final String localName;
private final StringBuilder buffer = new StringBuilder();
- public MetadataHandler(Metadata metadata, String name) {
- this.metadata = metadata;
- this.name = name;
- }
+ private int matchLevel = 0;
- public void addMetadata(String value) {
- if (value.length() > 0) {
- String previous = metadata.get(name);
- if (previous != null && previous.length() > 0) {
- value = previous + ", " + value;
- }
- metadata.set(name, value);
- }
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
}
- public void endElement(String uri, String localName, String name) {
- addMetadata(buffer.toString());
- buffer.setLength(0);
+ protected boolean isMatchingElement(String uri, String localName) {
+ return uri.equals(this.uri) && localName.equals(this.localName);
}
+ @Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
- for (int i = 0; i < attributes.getLength(); i++) {
- addMetadata(attributes.getValue(i));
+ if (isMatchingElement(uri, localName)) {
+ matchLevel++;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name) {
+ if (isMatchingElement(uri, localName)) {
+ matchLevel--;
+ if (matchLevel == 0) {
+ addMetadata(buffer.toString().trim());
+ buffer.setLength(0);
+ }
}
}
-
+ @Override
public void characters(char[] ch, int start, int length) {
- buffer.append(ch, start, length);
+ if (matchLevel > 0) {
+ buffer.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) {
+ characters(ch, start, length);
}
}
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
------------------------------------------------------------------------------
svn:executable = *
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1087014&r1=1087013&r2=1087014&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
Wed Mar 30 16:52:51 2011
@@ -25,6 +25,9 @@ import org.xml.sax.helpers.DefaultHandle
* the textual content of a node (if present), and
* all attribute values passed through the matcher
* (but not their names).
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ * {@link ElementMetadataHandler} classes instead
*/
public class MetadataHandler extends DefaultHandler {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1087014&r1=1087013&r2=1087014&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
Wed Mar 30 16:52:51 2011
@@ -44,7 +44,7 @@ public class DcXMLParserTest extends Tes
"Java, XML, XSLT, JDOM, Indexation",
metadata.get(Metadata.SUBJECT));
assertEquals(
- "Framework d\'indexation des documents XML, HTML, PDF etc.. ",
+ "Framework d\'indexation des documents XML, HTML, PDF etc..",
metadata.get(Metadata.DESCRIPTION));
assertEquals(
"http://www.apache.org",