Author: mattmann
Date: Fri Jun  8 15:54:31 2007
New Revision: 545659

URL: http://svn.apache.org/viewvc?view=rev&rev=545659
Log:
patch for TIKA-5

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/
    
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/CreativeCommons.java
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/DublinCore.java
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/HttpHeaders.java
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Office.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/SpellCheckedMetadata.java
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/package.html
    incubator/tika/trunk/src/main/test/org/apache/tika/metadata/
    
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestMetadata.java
    
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestSpellCheckedMetadata.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/pom.xml

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?view=diff&rev=545659&r1=545658&r2=545659
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Jun  8 15:54:31 2007
@@ -2,3 +2,5 @@
 
 Unreleased changes (0.1-dev)
 
+1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
+

Modified: incubator/tika/trunk/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?view=diff&rev=545659&r1=545658&r2=545659
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Jun  8 15:54:31 2007
@@ -28,6 +28,7 @@
   <groupId>org.apache.tika</groupId>
   <artifactId>tika</artifactId>
   <version>0.1-SNAPSHOT</version>
+  
 
   <name>Apache Tika</name>
   <!-- Keep on a single line, see http://jira.codehaus.org/browse/MJAR-39 -->
@@ -139,10 +140,39 @@
       <url>file://${basedir}/../site</url>
     </site>
   </distributionManagement>
-  
+
+  <dependencies>
+       <dependency>
+               <groupId>commons-lang</groupId>
+               <artifactId>commons-lang</artifactId>
+               <version>2.1</version>
+               <type>jar</type>
+               <scope/>
+               <optional/>
+       </dependency>
+       <dependency>
+               <groupId>junit</groupId>
+               <artifactId>junit</artifactId>
+               <version>3.8.1</version>
+               <type>jar</type>
+               <scope>test</scope>
+               <optional/>
+       </dependency>
+  </dependencies>
+
   <build>
-    <plugins>
-      <plugin>
+  <sourceDirectory>${basedir}/src/main/java/</sourceDirectory>
+  <testSourceDirectory>${basedir}/src/main/test/</testSourceDirectory>
+  <plugins>
+        <plugin>
+               <groupId>org.apache.maven.plugins</groupId>
+               <artifactId>maven-compiler-plugin</artifactId>
+               <configuration>
+                       <source>1.5</source>
+                       <target>1.5</target>
+               </configuration>
+        </plugin>
+        <plugin>
         <artifactId>maven-antrun-plugin</artifactId>
         <executions>
         <!-- Include the project metadata files in META-INF -->

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/CreativeCommons.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/CreativeCommons.java?view=auto&rev=545659
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/CreativeCommons.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/CreativeCommons.java
 Fri Jun  8 15:54:31 2007
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * A collection of Creative Commons property names.
+ * 
+ * @see <a href="http://www.creativecommons.org/">creativecommons.org</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface CreativeCommons {
+
+  public final static String LICENSE_URL = "License-Url";
+
+  public final static String LICENSE_LOCATION = "License-Location";
+
+  public final static String WORK_TYPE = "Work-Type";
+
+}

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/DublinCore.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/DublinCore.java?view=auto&rev=545659
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/metadata/DublinCore.java 
(added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/metadata/DublinCore.java 
Fri Jun  8 15:54:31 2007
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * A collection of Dublin Core metadata names.
+ * 
+ * @see <a href="http://dublincore.org">dublincore.org</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface DublinCore {
+
+  /**
+   * Typically, Format may include the media-type or dimensions of the resource.
+   * Format may be used to determine the software, hardware or other equipment
+   * needed to display or operate the resource. Examples of dimensions include
+   * size and duration. Recommended best practice is to select a value from a
+   * controlled vocabulary (for example, the list of Internet Media Types [MIME]
+   * defining computer media formats).
+   */
+  public static final String FORMAT = "format";
+
+  /**
+   * Recommended best practice is to identify the resource by means of a string
+   * or number conforming to a formal identification system. Example formal
+   * identification systems include the Uniform Resource Identifier (URI)
+   * (including the Uniform Resource Locator (URL)), the Digital Object
+   * Identifier (DOI) and the International Standard Book Number (ISBN).
+   */
+  public static final String IDENTIFIER = "identifier";
+
+  /**
+   * Date on which the resource was changed.
+   */
+  public static final String MODIFIED = "modified";
+
+  /**
+   * An entity responsible for making contributions to the content of the
+   * resource. Examples of a Contributor include a person, an organisation, or a
+   * service. Typically, the name of a Contributor should be used to indicate
+   * the entity.
+   */
+  public static final String CONTRIBUTOR = "contributor";
+
+  /**
+   * The extent or scope of the content of the resource. Coverage will typically
+   * include spatial location (a place name or geographic coordinates), temporal
+   * period (a period label, date, or date range) or jurisdiction (such as a
+   * named administrative entity). Recommended best practice is to select a
+   * value from a controlled vocabulary (for example, the Thesaurus of
+   * Geographic Names [TGN]) and that, where appropriate, named places or time
+   * periods be used in preference to numeric identifiers such as sets of
+   * coordinates or date ranges.
+   */
+  public static final String COVERAGE = "coverage";
+
+  /**
+   * An entity primarily responsible for making the content of the resource.
+   * Examples of a Creator include a person, an organisation, or a service.
+   * Typically, the name of a Creator should be used to indicate the entity.
+   */
+  public static final String CREATOR = "creator";
+
+  /**
+   * A date associated with an event in the life cycle of the resource.
+   * Typically, Date will be associated with the creation or availability of the
+   * resource. Recommended best practice for encoding the date value is defined
+   * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format.
+   */
+  public static final String DATE = "date";
+
+  /**
+   * An account of the content of the resource. Description may include but is
+   * not limited to: an abstract, table of contents, reference to a graphical
+   * representation of content or a free-text account of the content.
+   */
+  public static final String DESCRIPTION = "description";
+
+  /**
+   * A language of the intellectual content of the resource. Recommended best
+   * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639
+   * [ISO639], defines two- and three-letter primary language tags with optional
+   * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian,
+   * and "en-GB" for English used in the United Kingdom.
+   */
+  public static final String LANGUAGE = "language";
+
+  /**
+   * An entity responsible for making the resource available. Examples of a
+   * Publisher include a person, an organisation, or a service. Typically, the
+   * name of a Publisher should be used to indicate the entity.
+   */
+  public static final String PUBLISHER = "publisher";
+
+  /**
+   * A reference to a related resource. Recommended best practice is to
+   * reference the resource by means of a string or number conforming to a
+   * formal identification system.
+   */
+  public static final String RELATION = "relation";
+
+  /**
+   * Information about rights held in and over the resource. Typically, a Rights
+   * element will contain a rights management statement for the resource, or
+   * reference a service providing such information. Rights information often
+   * encompasses Intellectual Property Rights (IPR), Copyright, and various
+   * Property Rights. If the Rights element is absent, no assumptions can be
+   * made about the status of these and other rights with respect to the
+   * resource.
+   */
+  public static final String RIGHTS = "rights";
+
+  /**
+   * A reference to a resource from which the present resource is derived. The
+   * present resource may be derived from the Source resource in whole or in
+   * part. Recommended best practice is to reference the resource by means of a
+   * string or number conforming to a formal identification system.
+   */
+  public static final String SOURCE = "source";
+
+  /**
+   * The topic of the content of the resource. Typically, a Subject will be
+   * expressed as keywords, key phrases or classification codes that describe a
+   * topic of the resource. Recommended best practice is to select a value from
+   * a controlled vocabulary or formal classification scheme.
+   */
+  public static final String SUBJECT = "subject";
+
+  /**
+   * A name given to the resource. Typically, a Title will be a name by which
+   * the resource is formally known.
+   */
+  public static final String TITLE = "title";
+
+  /**
+   * The nature or genre of the content of the resource. Type includes terms
+   * describing general categories, functions, genres, or aggregation levels for
+   * content. Recommended best practice is to select a value from a controlled
+   * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe
+   * the physical or digital manifestation of the resource, use the Format
+   * element.
+   */
+  public static final String TYPE = "type";
+
+}

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/HttpHeaders.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/HttpHeaders.java?view=auto&rev=545659
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/HttpHeaders.java 
(added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/HttpHeaders.java 
Fri Jun  8 15:54:31 2007
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * A collection of HTTP header names.
+ * 
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol --
+ *      HTTP/1.1 (RFC 2616)</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface HttpHeaders {
+
+  public final static String CONTENT_ENCODING = "Content-Encoding";
+
+  public final static String CONTENT_LANGUAGE = "Content-Language";
+
+  public final static String CONTENT_LENGTH = "Content-Length";
+
+  public final static String CONTENT_LOCATION = "Content-Location";
+
+  public static final String CONTENT_DISPOSITION = "Content-Disposition";
+
+  public final static String CONTENT_MD5 = "Content-MD5";
+
+  public final static String CONTENT_TYPE = "Content-Type";
+
+  public final static String LAST_MODIFIED = "Last-Modified";
+
+  public final static String LOCATION = "Location";
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java?view=auto&rev=545659
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java 
(added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java 
Fri Jun  8 15:54:31 2007
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+// JDK imports
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+/**
+ * A multi-valued metadata container.
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ * 
+ */
+public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
+    Office {
+
+  /**
+   * A map of all metadata attributes.
+   */
+  private Map<String, String[]> metadata = null;
+
+  /**
+   * Constructs a new, empty metadata.
+   */
+  public Metadata() {
+    metadata = new HashMap<String, String[]>();
+  }
+
+  /**
+   * Returns true if named value is multivalued.
+   * 
+   * @param name
+   *          name of metadata
+   * @return true if named value is multivalued, false if single value or null
+   */
+  public boolean isMultiValued(final String name) {
+    return metadata.get(name) != null && metadata.get(name).length > 1;
+  }
+
+  /**
+   * Returns an array of the names contained in the metadata.
+   * 
+   * @return Metadata names
+   */
+  public String[] names() {
+    return metadata.keySet().toArray(new String[metadata.keySet().size()]);
+  }
+
+  /**
+   * Get the value associated to a metadata name. If many values are associated
+   * to the specified name, then the first one is returned.
+   * 
+   * @param name
+   *          of the metadata.
+   * @return the value associated to the specified metadata name.
+   */
+  public String get(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      return null;
+    } else {
+      return values[0];
+    }
+  }
+
+  /**
+   * Get the values associated to a metadata name.
+   * 
+   * @param name
+   *          of the metadata.
+   * @return the values associated to a metadata name.
+   */
+  public String[] getValues(final String name) {
+    return _getValues(name);
+  }
+
+  private String[] _getValues(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      values = new String[0];
+    }
+    return values;
+  }
+
+  /**
+   * Add a metadata name/value mapping. Add the specified value to the list of
+   * values associated to the specified metadata name.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void add(final String name, final String value) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      set(name, value);
+    } else {
+      String[] newValues = new String[values.length + 1];
+      System.arraycopy(values, 0, newValues, 0, values.length);
+      newValues[newValues.length - 1] = value;
+      metadata.put(name, newValues);
+    }
+  }
+
+  /**
+   * Copy all key-value pairs from properties.
+   * 
+   * @param properties
+   *          properties to copy from
+   */
+  public void setAll(Properties properties) {
+    Enumeration names = properties.propertyNames();
+    while (names.hasMoreElements()) {
+      String name = (String) names.nextElement();
+      metadata.put(name, new String[] { properties.getProperty(name) });
+    }
+  }
+
+  /**
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void set(String name, String value) {
+    metadata.put(name, new String[] { value });
+  }
+
+  /**
+   * Remove a metadata and all its associated values.
+   * 
+   * @param name
+   *          metadata name to remove
+   */
+  public void remove(String name) {
+    metadata.remove(name);
+  }
+
+  /**
+   * Returns the number of metadata names in this metadata.
+   * 
+   * @return number of metadata names
+   */
+  public int size() {
+    return metadata.size();
+  }
+
+  public boolean equals(Object o) {
+
+    if (o == null) {
+      return false;
+    }
+
+    Metadata other = null;
+    try {
+      other = (Metadata) o;
+    } catch (ClassCastException cce) {
+      return false;
+    }
+
+    if (other.size() != size()) {
+      return false;
+    }
+
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] otherValues = other._getValues(names[i]);
+      String[] thisValues = _getValues(names[i]);
+      if (otherValues.length != thisValues.length) {
+        return false;
+      }
+      for (int j = 0; j < otherValues.length; j++) {
+        if (!otherValues[j].equals(thisValues[j])) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  public String toString() {
+    StringBuffer buf = new StringBuffer();
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] values = _getValues(names[i]);
+      for (int j = 0; j < values.length; j++) {
+        buf.append(names[i]).append("=").append(values[j]).append(" ");
+      }
+    }
+    return buf.toString();
+  }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Office.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Office.java?view=auto&rev=545659
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Office.java 
(added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Office.java Fri 
Jun  8 15:54:31 2007
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * A collection of <i>"Office"</i> document property names.
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface Office {
+
+  public static final String KEYWORDS = "Keywords";
+
+  public static final String COMMENTS = "Comments";
+
+  public static final String LAST_AUTHOR = "Last-Author";
+
+  public static final String APPLICATION_NAME = "Application-Name";
+
+  public static final String CHARACTER_COUNT = "Character Count";
+
+  public static final String LAST_PRINTED = "Last-Printed";
+
+  public static final String LAST_SAVED = "Last-Save-Date";
+
+  public static final String PAGE_COUNT = "Page-Count";
+
+  public static final String REVISION_NUMBER = "Revision-Number";
+
+  public static final String WORD_COUNT = "Word-Count";
+
+  public static final String TEMPLATE = "Template";
+
+  public static final String AUTHOR = "Author";
+
+}

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/SpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/SpellCheckedMetadata.java?view=auto&rev=545659
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/SpellCheckedMetadata.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/SpellCheckedMetadata.java
 Fri Jun  8 15:54:31 2007
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+//JDK imports
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+//Apache imports
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * A decorator to Metadata that adds spellchecking capabilities to property
+ * names. Currently used spelling vocabulary contains just the HTTP headers from
+ * the {@link HttpHeaders} class.
+ * 
+ */
+public class SpellCheckedMetadata extends Metadata {
+
+  /**
+   * Threshold divider.
+   *
+   * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
+   */
+  private static final int TRESHOLD_DIVIDER = 3;
+
+  /**
+   * Normalized name to name mapping.
+   */
+  private final static Map<String, String> NAMES_IDX = new HashMap<String, 
String>();
+
+  /**
+   * Array holding map keys.
+   */
+  private static String[] normalized = null;
+
+  static {
+
+    // Uses following array to fill the metanames index and the
+    // metanames list.
+    Class[] spellthese = {HttpHeaders.class};
+
+    for (Class spellCheckedNames : spellthese) {
+      for (Field field : spellCheckedNames.getFields()) {
+        int mods = field.getModifiers();
+        if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+            && Modifier.isStatic(mods) && 
field.getType().equals(String.class)) {
+          try {
+            String val = (String) field.get(null);
+            NAMES_IDX.put(normalize(val), val);
+          } catch (Exception e) {
+            // Simply ignore...
+          }
+        }
+      }
+    }
+    normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
+  }
+
+  /**
+   * Normalizes String.
+   *
+   * @param str
+   *          the string to normalize
+   * @return normalized String
+   */
+  private static String normalize(final String str) {
+    char c;
+    StringBuffer buf = new StringBuffer();
+    for (int i = 0; i < str.length(); i++) {
+      c = str.charAt(i);
+      if (Character.isLetter(c)) {
+        buf.append(Character.toLowerCase(c));
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Get the normalized name of metadata attribute name. This method tries to
+   * find a well-known metadata name (one of the metadata names defined in this
+   * class) that matches the specified name. The matching is error tolerant. For
+   * instance,
+   * <ul>
+   * <li>content-type gives Content-Type</li>
+   * <li>CoNtEntType gives Content-Type</li>
+   * <li>ConTnTtYpe gives Content-Type</li>
+   * </ul>
+   * If no matching with a well-known metadata name is found, then the original
+   * name is returned.
+   *
+   * @param name
+   *          Name to normalize
+   * @return normalized name
+   */
+  public static String getNormalizedName(final String name) {
+    String searched = normalize(name);
+    String value = NAMES_IDX.get(searched);
+
+    if ((value == null) && (normalized != null)) {
+      int threshold = searched.length() / TRESHOLD_DIVIDER;
+      for (int i = 0; i < normalized.length && value == null; i++) {
+        if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < 
threshold) {
+          value = NAMES_IDX.get(normalized[i]);
+        }
+      }
+    }
+    return (value != null) ? value : name;
+  }
+
+  @Override
+  public void remove(final String name) {
+    super.remove(getNormalizedName(name));
+  }
+
+  @Override
+  public void add(final String name, final String value) {
+    super.add(getNormalizedName(name), value);
+  }
+
+  @Override
+  public String[] getValues(final String name) {
+    return super.getValues(getNormalizedName(name));
+  }
+
+  @Override
+  public String get(final String name) {
+    return super.get(getNormalizedName(name));
+  }
+
+  @Override
+  public void set(final String name, final String value) {
+    super.set(getNormalizedName(name), value);
+  }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/metadata/package.html
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/package.html?view=auto&rev=545659
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/metadata/package.html 
(added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/metadata/package.html 
Fri Jun  8 15:54:31 2007
@@ -0,0 +1,6 @@
+<html>
+<body>
+A Multi-valued Metadata container, and set
+of constant fields for Tika Metadata.
+</body>
+</html>

Added: 
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestMetadata.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestMetadata.java?view=auto&rev=545659
==============================================================================
--- 
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestMetadata.java 
(added)
+++ 
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestMetadata.java 
Fri Jun  8 15:54:31 2007
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+// JDK imports
+import java.util.Properties;
+
+// Junit imports
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+/**
+ * JUnit based tests of class {@link org.apache.tika.metadata.Metadata}.
+ */
+public class TestMetadata extends TestCase {
+
+  /** Metadata name used by the add/set tests. */
+  private static final String CONTENTTYPE = "contenttype";
+
+  /**
+   * Creates a test case that runs the named test method.
+   *
+   * @param testName name of the test method to execute
+   */
+  public TestMetadata(String testName) {
+    super(testName);
+  }
+
+  /**
+   * @return a suite built from this test class
+   */
+  public static Test suite() {
+    return new TestSuite(TestMetadata.class);
+  }
+
+  /**
+   * Command-line entry point: runs the suite with the text runner.
+   *
+   * @param args not used
+   */
+  public static void main(String[] args) {
+    Test everything = suite();
+    TestRunner.run(everything);
+  }
+
+  /**
+   * Asserts that <code>actual</code> has the expected length and then
+   * that each element equals the expected one, in order.
+   *
+   * @param actual   array returned by the object under test
+   * @param expected values that must be present, in this order
+   */
+  private static void assertValues(String[] actual, String... expected) {
+    assertEquals(expected.length, actual.length);
+    for (int i = 0; i < expected.length; i++) {
+      assertEquals(expected[i], actual[i]);
+    }
+  }
+
+  /** Test for the <code>add(String, String)</code> method. */
+  public void testAdd() {
+    Metadata metadata = new Metadata();
+
+    // A name that was never added yields an empty array.
+    assertValues(metadata.getValues(CONTENTTYPE));
+
+    metadata.add(CONTENTTYPE, "value1");
+    assertValues(metadata.getValues(CONTENTTYPE), "value1");
+
+    metadata.add(CONTENTTYPE, "value2");
+    assertValues(metadata.getValues(CONTENTTYPE), "value1", "value2");
+
+    // NOTE : For now, the same value can be added many times.
+    // Should it be changed?
+    metadata.add(CONTENTTYPE, "value1");
+    assertValues(metadata.getValues(CONTENTTYPE),
+        "value1", "value2", "value1");
+  }
+
+  /** Test for the <code>set(String, String)</code> method. */
+  public void testSet() {
+    Metadata metadata = new Metadata();
+
+    assertValues(metadata.getValues(CONTENTTYPE));
+
+    metadata.set(CONTENTTYPE, "value1");
+    assertValues(metadata.getValues(CONTENTTYPE), "value1");
+
+    // A second set replaces the previous value instead of appending.
+    metadata.set(CONTENTTYPE, "value2");
+    assertValues(metadata.getValues(CONTENTTYPE), "value2");
+
+    // An add following a set appends after the value that was set.
+    metadata.set(CONTENTTYPE, "new value 1");
+    metadata.add("contenttype", "new value 2");
+    assertValues(metadata.getValues(CONTENTTYPE),
+        "new value 1", "new value 2");
+  }
+
+  /** Test for <code>setAll(Properties)</code> method. */
+  public void testSetProperties() {
+    Metadata metadata = new Metadata();
+    Properties properties = new Properties();
+
+    // Empty properties leave the metadata empty.
+    metadata.setAll(properties);
+    assertEquals(0, metadata.size());
+
+    properties.setProperty("name-one", "value1.1");
+    metadata.setAll(properties);
+    assertEquals(1, metadata.size());
+    assertValues(metadata.getValues("name-one"), "value1.1");
+
+    // Applying the grown properties again must not duplicate "name-one".
+    properties.setProperty("name-two", "value2.1");
+    metadata.setAll(properties);
+    assertEquals(2, metadata.size());
+    assertValues(metadata.getValues("name-one"), "value1.1");
+    assertValues(metadata.getValues("name-two"), "value2.1");
+  }
+
+  /** Test for <code>get(String)</code> method. */
+  public void testGet() {
+    Metadata metadata = new Metadata();
+    assertNull(metadata.get("a-name"));
+    metadata.add("a-name", "value-1");
+    assertEquals("value-1", metadata.get("a-name"));
+    // With several values present, get still reports the first one.
+    metadata.add("a-name", "value-2");
+    assertEquals("value-1", metadata.get("a-name"));
+  }
+
+  /** Test for <code>isMultiValued()</code> method. */
+  public void testIsMultiValued() {
+    Metadata metadata = new Metadata();
+    assertFalse(metadata.isMultiValued("key"));
+    metadata.add("key", "value1");
+    assertFalse(metadata.isMultiValued("key"));
+    metadata.add("key", "value2");
+    assertTrue(metadata.isMultiValued("key"));
+  }
+
+  /** Test for <code>names</code> method. */
+  public void testNames() {
+    Metadata metadata = new Metadata();
+    String[] known = metadata.names();
+    assertEquals(0, known.length);
+
+    metadata.add("name-one", "value");
+    known = metadata.names();
+    assertEquals(1, known.length);
+    assertEquals("name-one", known[0]);
+    // With two names only the count is checked, not the order.
+    metadata.add("name-two", "value");
+    known = metadata.names();
+    assertEquals(2, known.length);
+  }
+
+  /** Test for <code>remove(String)</code> method. */
+  public void testRemove() {
+    Metadata metadata = new Metadata();
+    // Removing from an empty container leaves it empty.
+    metadata.remove("name-one");
+    assertEquals(0, metadata.size());
+    metadata.add("name-one", "value-1.1");
+    metadata.add("name-one", "value-1.2");
+    metadata.add("name-two", "value-2.2");
+    // size() counts names, not values.
+    assertEquals(2, metadata.size());
+    assertNotNull(metadata.get("name-one"));
+    assertNotNull(metadata.get("name-two"));
+    metadata.remove("name-one");
+    assertEquals(1, metadata.size());
+    assertNull(metadata.get("name-one"));
+    assertNotNull(metadata.get("name-two"));
+    metadata.remove("name-two");
+    assertEquals(0, metadata.size());
+    assertNull(metadata.get("name-one"));
+    assertNull(metadata.get("name-two"));
+  }
+
+  /** Test for <code>equals(Object)</code> method. */
+  public void testObject() {
+    Metadata left = new Metadata();
+    Metadata right = new Metadata();
+    assertFalse(left.equals(null));
+    assertFalse(left.equals("String"));
+    assertTrue(left.equals(right));
+    // Each addition to one side breaks equality until the other side
+    // receives the same name/value pair.
+    left.add("name-one", "value-1.1");
+    assertFalse(left.equals(right));
+    right.add("name-one", "value-1.1");
+    assertTrue(left.equals(right));
+    left.add("name-one", "value-1.2");
+    assertFalse(left.equals(right));
+    right.add("name-one", "value-1.2");
+    assertTrue(left.equals(right));
+    left.add("name-two", "value-2.1");
+    assertFalse(left.equals(right));
+    right.add("name-two", "value-2.1");
+    assertTrue(left.equals(right));
+    left.add("name-two", "value-2.2");
+    assertFalse(left.equals(right));
+    // Same name and value count, but a different value: still unequal.
+    right.add("name-two", "value-2.x");
+    assertFalse(left.equals(right));
+  }
+
+}

Added: 
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestSpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestSpellCheckedMetadata.java?view=auto&rev=545659
==============================================================================
--- 
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestSpellCheckedMetadata.java
 (added)
+++ 
incubator/tika/trunk/src/main/test/org/apache/tika/metadata/TestSpellCheckedMetadata.java
 Fri Jun  8 15:54:31 2007
@@ -0,0 +1,262 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+// JDK imports
+import java.util.Properties;
+
+// Junit imports
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.tika.metadata.SpellCheckedMetadata}.
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestSpellCheckedMetadata extends TestCase {
+
+  /** Number of objects built by {@link #testHandlingSpeed()}. */
+  private static final int NUM_ITERATIONS = 10000;
+
+  /**
+   * Creates a test case that runs the named test method.
+   *
+   * @param testName name of the test method to execute
+   */
+  public TestSpellCheckedMetadata(String testName) {
+    super(testName);
+  }
+
+  /**
+   * @return a suite built from this test class
+   */
+  public static Test suite() {
+    return new TestSuite(TestSpellCheckedMetadata.class);
+  }
+
+  /**
+   * Command-line entry point: runs the suite with the text runner.
+   *
+   * @param args not used
+   */
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  /** Test for the <code>getNormalizedName(String)</code> method. */
+  public void testGetNormalizedName() {
+    // Exact name.
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("Content-Type"));
+    // Hyphen missing.
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("ContentType"));
+    // Different capitalisation.
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("Content-type"));
+    // Lower case and hyphen missing.
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("contenttype"));
+    // One letter dropped.
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("contentype"));
+    // Two letters dropped.
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("contntype"));
+  }
+
+  /** Test for the <code>add(String, String)</code> method. */
+  public void testAdd() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+    values = meta.getValues("contentype");
+    assertEquals(0, values.length);
+
+    // The misspelled, exact and hyphen-less spellings used below are
+    // all expected to address the same entry.
+    meta.add("contentype", "value1");
+    values = meta.getValues("contentype");
+    assertEquals(1, values.length);
+    assertEquals("value1", values[0]);
+
+    meta.add("Content-Type", "value2");
+    values = meta.getValues("contentype");
+    assertEquals(2, values.length);
+    assertEquals("value1", values[0]);
+    assertEquals("value2", values[1]);
+
+    // NOTE : For now, the same value can be added many times.
+    // Should it be changed?
+    meta.add("ContentType", "value1");
+    values = meta.getValues("Content-Type");
+    assertEquals(3, values.length);
+    assertEquals("value1", values[0]);
+    assertEquals("value2", values[1]);
+    assertEquals("value1", values[2]);
+  }
+
+  /** Test for the <code>set(String, String)</code> method. */
+  public void testSet() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+    values = meta.getValues("contentype");
+    assertEquals(0, values.length);
+
+    meta.set("contentype", "value1");
+    values = meta.getValues("contentype");
+    assertEquals(1, values.length);
+    assertEquals("value1", values[0]);
+
+    // Setting through another spelling replaces the value set above.
+    meta.set("Content-Type", "value2");
+    values = meta.getValues("contentype");
+    assertEquals(1, values.length);
+    assertEquals("value2", values[0]);
+
+    // An add following a set appends after the value that was set.
+    meta.set("contenttype", "new value 1");
+    meta.add("contenttype", "new value 2");
+    values = meta.getValues("contentype");
+    assertEquals(2, values.length);
+    assertEquals("new value 1", values[0]);
+    assertEquals("new value 2", values[1]);
+  }
+
+  /** Test for <code>setAll(Properties)</code> method. */
+  public void testSetProperties() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    Properties props = new Properties();
+
+    // Empty properties leave the metadata empty.
+    meta.setAll(props);
+    assertEquals(0, meta.size());
+
+    props.setProperty("name-one", "value1.1");
+    meta.setAll(props);
+    assertEquals(1, meta.size());
+    values = meta.getValues("name-one");
+    assertEquals(1, values.length);
+    assertEquals("value1.1", values[0]);
+
+    // Applying the grown properties again must not duplicate "name-one".
+    props.setProperty("name-two", "value2.1");
+    meta.setAll(props);
+    assertEquals(2, meta.size());
+    values = meta.getValues("name-one");
+    assertEquals(1, values.length);
+    assertEquals("value1.1", values[0]);
+    values = meta.getValues("name-two");
+    assertEquals(1, values.length);
+    assertEquals("value2.1", values[0]);
+  }
+
+  /** Test for <code>get(String)</code> method. */
+  public void testGet() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    assertNull(meta.get("a-name"));
+
+    meta.add("a-name", "value-1");
+    assertEquals("value-1", meta.get("a-name"));
+    // With several values present, get still reports the first one.
+    meta.add("a-name", "value-2");
+    assertEquals("value-1", meta.get("a-name"));
+  }
+
+  /** Test for <code>isMultiValued()</code> method. */
+  public void testIsMultiValued() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value1");
+    assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value2");
+    assertTrue(meta.isMultiValued("key"));
+  }
+
+  /** Test for <code>names</code> method. */
+  public void testNames() {
+    String[] names = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    names = meta.names();
+    assertEquals(0, names.length);
+
+    meta.add("name-one", "value");
+    names = meta.names();
+    assertEquals(1, names.length);
+    assertEquals("name-one", names[0]);
+    // With two names only the count is checked, not the order.
+    meta.add("name-two", "value");
+    names = meta.names();
+    assertEquals(2, names.length);
+  }
+
+  /** Test for <code>remove(String)</code> method. */
+  public void testRemove() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    // Removing from an empty container leaves it empty.
+    meta.remove("name-one");
+    assertEquals(0, meta.size());
+    meta.add("name-one", "value-1.1");
+    meta.add("name-one", "value-1.2");
+    meta.add("name-two", "value-2.2");
+    // size() counts names, not values.
+    assertEquals(2, meta.size());
+    assertNotNull(meta.get("name-one"));
+    assertNotNull(meta.get("name-two"));
+    meta.remove("name-one");
+    assertEquals(1, meta.size());
+    assertNull(meta.get("name-one"));
+    assertNotNull(meta.get("name-two"));
+    meta.remove("name-two");
+    assertEquals(0, meta.size());
+    assertNull(meta.get("name-one"));
+    assertNull(meta.get("name-two"));
+  }
+
+  /** Test for <code>equals(Object)</code> method. */
+  public void testObject() {
+    SpellCheckedMetadata meta1 = new SpellCheckedMetadata();
+    SpellCheckedMetadata meta2 = new SpellCheckedMetadata();
+    assertFalse(meta1.equals(null));
+    assertFalse(meta1.equals("String"));
+    assertTrue(meta1.equals(meta2));
+    // Each addition to one side breaks equality until the other side
+    // receives the same name/value pair.
+    meta1.add("name-one", "value-1.1");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.1");
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.2");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.2");
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.1");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.1");
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.2");
+    assertFalse(meta1.equals(meta2));
+    // Same name and value count, but a different value: still unequal.
+    meta2.add("name-two", "value-2.x");
+    assertFalse(meta1.equals(meta2));
+  }
+
+  /**
+   * IO Test method, usable only when you plan to do changes in metadata to
+   * measure relative performance impact. It asserts nothing; it prints the
+   * elapsed wall-clock time for building <code>NUM_ITERATIONS</code>
+   * objects to standard output.
+   */
+  public final void testHandlingSpeed() {
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < NUM_ITERATIONS; i++) {
+      // Only the cost of building the object is of interest here, so the
+      // constructed instance is deliberately discarded.
+      constructSpellCheckedMetadata();
+    }
+    System.out.println(NUM_ITERATIONS + " spellchecked metadata I/O time:"
+        + (System.currentTimeMillis() - start) + "ms.");
+  }
+
+  /**
+   * Assembles a Spellchecked metadata Object.
+   *
+   * @return a new instance populated with eight HTTP-header-like entries
+   */
+  public static final SpellCheckedMetadata constructSpellCheckedMetadata() {
+    SpellCheckedMetadata scmd = new SpellCheckedMetadata();
+    scmd.add("Content-type", "foo/bar");
+    scmd.add("Connection", "close");
+    scmd.add("Last-Modified", "Sat, 09 Dec 2006 15:09:57 GMT");
+    scmd.add("Server", "Foobar");
+    scmd.add("Date", "Sat, 09 Dec 2006 18:07:20 GMT");
+    scmd.add("Accept-Ranges", "bytes");
+    scmd.add("ETag", "\"1234567-89-01234567\"");
+    scmd.add("Content-Length", "123");
+    return scmd;
+  }
+
+}


Reply via email to