Author: jukka
Date: Wed Mar 30 16:52:51 2011
New Revision: 1087014

URL: http://svn.apache.org/viewvc?rev=1087014&view=rev
Log:
TIKA-625: Easier XML parser extensibility

Add the proposed Attribute- and ElementMetadataHandler classes

Added:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
   (contents, props changed)
      - copied, changed from r1085453, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
   (with props)
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
   (contents, props changed)
      - copied, changed from r1085453, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java

Copied: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
 (from r1085453, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java)
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java&r1=1085453&r2=1087014&rev=1087014&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
 Wed Mar 30 16:52:51 2011
@@ -17,30 +17,33 @@
 package org.apache.tika.parser.xml;
 
 import org.apache.tika.metadata.Metadata;
-import org.xml.sax.Attributes;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * This adds Metadata entries with a specified name for
- *  the textual content of a node (if present), and 
- *  all attribute values passed through the matcher
- *  (but not their names). 
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 1.0
  */
-public class MetadataHandler extends DefaultHandler {
+class AbstractMetadataHandler extends DefaultHandler {
 
     private final Metadata metadata;
 
     private final String name;
 
-    private final StringBuilder buffer = new StringBuilder();
-
-    public MetadataHandler(Metadata metadata, String name) {
+    protected AbstractMetadataHandler(Metadata metadata, String name) {
         this.metadata = metadata;
         this.name = name;
     }
 
-    public void addMetadata(String value) {
-        if (value.length() > 0) {
+    /**
+     * Adds the given metadata value. The value is ignored if it is
+     * <code>null</code> or empty. If the metadata entry already exists,
+     * then the given value is appended to it with a comma as the separator.
+     *
+     * @param value metadata value
+     */
+    protected void addMetadata(String value) {
+        if (value != null && value.length() > 0) {
             String previous = metadata.get(name);
             if (previous != null && previous.length() > 0) {
                 value = previous + ", " + value;
@@ -49,21 +52,4 @@ public class MetadataHandler extends Def
         }
     }
 
-    public void endElement(String uri, String localName, String name) {
-        addMetadata(buffer.toString());
-        buffer.setLength(0);
-    }
-
-    public void startElement(
-            String uri, String localName, String name, Attributes attributes) {
-        for (int i = 0; i < attributes.getLength(); i++) {
-            addMetadata(attributes.getValue(i));
-        }
-    }
-
-    
-    public void characters(char[] ch, int start, int length) {
-        buffer.append(ch, start, length);
-    }
-
 }

Propchange: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
------------------------------------------------------------------------------
    svn:executable = *

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java?rev=1087014&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
 Wed Mar 30 16:52:51 2011
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that maps the contents of an XML attribute into
+ * a metadata field.
+ *
+ * @since Apache Tika 1.0
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+    private final String uri;
+
+    private final String localName;
+
+    protected AttributeMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            if (attributes.getURI(i).equals(this.uri)
+                    && attributes.getLocalName(i).equals(this.localName)) {
+                addMetadata(attributes.getValue(i).trim());
+            }
+        }
+    }
+
+}

Propchange: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=1087014&r1=1087013&r2=1087014&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
 Wed Mar 30 16:52:51 2011
@@ -19,10 +19,6 @@ package org.apache.tika.parser.xml;
 import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.xpath.CompositeMatcher;
-import org.apache.tika.sax.xpath.Matcher;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
 import org.xml.sax.ContentHandler;
 
 /**
@@ -30,39 +26,29 @@ import org.xml.sax.ContentHandler;
  */
 public class DcXMLParser extends XMLParser {
 
-    private static final XPathParser DC_XPATH = new XPathParser(
-            "dc", "http://purl.org/dc/elements/1.1/");
-
-    private static ContentHandler getDublinCore(
-            ContentHandler ch, Metadata md, String name, String element) {
-        Matcher matcher = new CompositeMatcher(
-                DC_XPATH.parse("//dc:" + element),
-                DC_XPATH.parse("//dc:" + element + "//text()"));
-        ContentHandler branch =
-            new MatchingContentHandler(new MetadataHandler(md, name), matcher);
-        return new TeeContentHandler(ch, branch);
+    private static ContentHandler getDublinCoreHandler(
+            Metadata metadata, String name, String element) {
+        return new ElementMetadataHandler(
+                "http://purl.org/dc/elements/1.1/", element,
+                metadata, name);
     }
 
-    protected ContentHandler getContentHandler(ContentHandler ch, Metadata md) {
-        ch = super.getContentHandler(ch, md);
-        ch = getDublinCore(ch, md, DublinCore.TITLE, "title");
-        ch = getDublinCore(ch, md, DublinCore.SUBJECT, "subject");
-        ch = getDublinCore(ch, md, DublinCore.CREATOR, "creator");
-        ch = getDublinCore(ch, md, DublinCore.DESCRIPTION, "description");
-        ch = getDublinCore(ch, md, DublinCore.PUBLISHER, "publisher");
-        ch = getDublinCore(ch, md, DublinCore.CONTRIBUTOR, "contributor");
-        try {
-            ch = getDublinCore(ch, md, DublinCore.DATE.getName(), "date");
-        } catch (Exception e) {
-            // Date format and parsing behavior was undefined and untested when DublinCare
-            // date was converted to Property.internalDate so we silently skip date on parse error
-        }
-        ch = getDublinCore(ch, md, DublinCore.TYPE, "type");
-        ch = getDublinCore(ch, md, DublinCore.FORMAT, "format");
-        ch = getDublinCore(ch, md, DublinCore.IDENTIFIER, "identifier");
-        ch = getDublinCore(ch, md, DublinCore.LANGUAGE, "language");
-        ch = getDublinCore(ch, md, DublinCore.RIGHTS, "rights");
-        return ch;
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata) {
+        return new TeeContentHandler(
+                super.getContentHandler(handler, metadata),
+                getDublinCoreHandler(metadata, DublinCore.TITLE, "title"),
+                getDublinCoreHandler(metadata, DublinCore.SUBJECT, "subject"),
+                getDublinCoreHandler(metadata, DublinCore.CREATOR, "creator"),
+                getDublinCoreHandler(metadata, DublinCore.DESCRIPTION, "description"),
+                getDublinCoreHandler(metadata, DublinCore.PUBLISHER, "publisher"),
+                getDublinCoreHandler(metadata, DublinCore.CONTRIBUTOR, "contributor"),
+                getDublinCoreHandler(metadata, DublinCore.DATE.getName(), "date"),
+                getDublinCoreHandler(metadata, DublinCore.TYPE, "type"),
+                getDublinCoreHandler(metadata, DublinCore.FORMAT, "format"),
+                getDublinCoreHandler(metadata, DublinCore.IDENTIFIER, "identifier"),
+                getDublinCoreHandler(metadata, DublinCore.LANGUAGE, "language"),
+                getDublinCoreHandler(metadata, DublinCore.RIGHTS, "rights"));
     }
 
 }

Copied: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
 (from r1085453, 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java)
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java?p2=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java&p1=tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java&r1=1085453&r2=1087014&rev=1087014&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
 Wed Mar 30 16:52:51 2011
@@ -18,52 +18,63 @@ package org.apache.tika.parser.xml;
 
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.Attributes;
-import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * This adds Metadata entries with a specified name for
- *  the textual content of a node (if present), and 
- *  all attribute values passed through the matcher
- *  (but not their names). 
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ *
+ * @since Apache Tika 1.0
  */
-public class MetadataHandler extends DefaultHandler {
+public class ElementMetadataHandler extends AbstractMetadataHandler {
 
-    private final Metadata metadata;
+    private final String uri;
 
-    private final String name;
+    private final String localName;
 
     private final StringBuilder buffer = new StringBuilder();
 
-    public MetadataHandler(Metadata metadata, String name) {
-        this.metadata = metadata;
-        this.name = name;
-    }
+    private int matchLevel = 0;
 
-    public void addMetadata(String value) {
-        if (value.length() > 0) {
-            String previous = metadata.get(name);
-            if (previous != null && previous.length() > 0) {
-                value = previous + ", " + value;
-            }
-            metadata.set(name, value);
-        }
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, String name) {
+        super(metadata, name);
+        this.uri = uri;
+        this.localName = localName;
     }
 
-    public void endElement(String uri, String localName, String name) {
-        addMetadata(buffer.toString());
-        buffer.setLength(0);
+    protected boolean isMatchingElement(String uri, String localName) {
+        return uri.equals(this.uri) && localName.equals(this.localName);
     }
 
+    @Override
     public void startElement(
             String uri, String localName, String name, Attributes attributes) {
-        for (int i = 0; i < attributes.getLength(); i++) {
-            addMetadata(attributes.getValue(i));
+        if (isMatchingElement(uri, localName)) {
+            matchLevel++;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        if (isMatchingElement(uri, localName)) {
+            matchLevel--;
+            if (matchLevel == 0) {
+                addMetadata(buffer.toString().trim());
+                buffer.setLength(0);
+            }
         }
     }
 
-    
+    @Override
     public void characters(char[] ch, int start, int length) {
-        buffer.append(ch, start, length);
+        if (matchLevel > 0) {
+            buffer.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) {
+        characters(ch, start, length);
     }
 
 }

Propchange: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java?rev=1087014&r1=1087013&r2=1087014&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
 Wed Mar 30 16:52:51 2011
@@ -25,6 +25,9 @@ import org.xml.sax.helpers.DefaultHandle
  *  the textual content of a node (if present), and 
  *  all attribute values passed through the matcher
  *  (but not their names). 
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ *             {@link ElementMetadataHandler} classes instead
  */
 public class MetadataHandler extends DefaultHandler {
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1087014&r1=1087013&r2=1087014&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
 Wed Mar 30 16:52:51 2011
@@ -44,7 +44,7 @@ public class DcXMLParserTest extends Tes
                     "Java, XML, XSLT, JDOM, Indexation",
                     metadata.get(Metadata.SUBJECT));
             assertEquals(
-                    "Framework d\'indexation des documents XML, HTML, PDF etc.. ",
+                    "Framework d\'indexation des documents XML, HTML, PDF etc..",
                     metadata.get(Metadata.DESCRIPTION));
             assertEquals(
                     "http://www.apache.org",


Reply via email to