Author: mattmann
Date: Fri Jul 16 22:55:48 2010
New Revision: 964970

URL: http://svn.apache.org/viewvc?rev=964970&view=rev
Log:
- OODT-15 WIP: fix problems with extractors in metadata; start to fix and 
enable unit tests that were disabled per kelly

Added:
    incubator/oodt/trunk/metadata/src/testdata/
    incubator/oodt/trunk/metadata/src/testdata/met_extr_preconditions.xml
    incubator/oodt/trunk/metadata/src/testdata/tika-mimetypes.xml
Modified:
    incubator/oodt/trunk/metadata/pom.xml
    
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CmdLineMetExtractor.java
    
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CopyAndRewriteExtractor.java
    
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/util/MimeTypeUtils.java
    
incubator/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/preconditions/TestPreCondEvalUtils.java

Modified: incubator/oodt/trunk/metadata/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/pom.xml?rev=964970&r1=964969&r2=964970&view=diff
==============================================================================
--- incubator/oodt/trunk/metadata/pom.xml (original)
+++ incubator/oodt/trunk/metadata/pom.xml Fri Jul 16 22:55:48 2010
@@ -57,6 +57,16 @@ the License.
         </includes>
       </resource>
     </resources>
+    <testResources>
+      <testResource>
+         <targetPath>org/apache/oodt/cas/metadata/preconditions</targetPath>
+         <directory>${basedir}/src/testdata</directory>
+         <includes>
+             <include>met_extr_preconditions.xml</include>
+             <include>tika-mimetypes.xml</include>
+         </includes>
+     </testResource>       
+    </testResources>
     <plugins>
       <plugin>
         <artifactId>maven-surefire-plugin</artifactId>
@@ -73,7 +83,6 @@ the License.
           </includes>
           <excludes>
               <!-- FIXME: These all assume $CWD, but should use 
getResourceAsStream instead. Later. -->
-              
<exclude>org/apache/oodt/cas/metadata/preconditions/TestPreCondEvalUtils.java</exclude>
               
<exclude>org/apache/oodt/cas/metadata/extractors/TestMetReader.java</exclude>
               
<exclude>org/apache/oodt/cas/metadata/extractors/TestExternMetExtractor.java</exclude>
               
<exclude>org/apache/oodt/cas/metadata/extractors/TestCopyAndRewriteExtractor.java</exclude>
@@ -109,9 +118,9 @@ the License.
       <version>1.3</version>
     </dependency>
     <dependency>
-      <groupId>apache</groupId>
+      <groupId>org.apache.tika</groupId>
       <artifactId>tika</artifactId>
-      <version>0.2-fork</version>
+      <version>0.3</version>
     </dependency>
     <dependency>
       <groupId>org.springframework</groupId>

Modified: 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CmdLineMetExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CmdLineMetExtractor.java?rev=964970&r1=964969&r2=964970&view=diff
==============================================================================
--- 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CmdLineMetExtractor.java
 (original)
+++ 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CmdLineMetExtractor.java
 Fri Jul 16 22:55:48 2010
@@ -29,6 +29,7 @@ import java.util.logging.Level;
 //OODT imports
 import org.apache.oodt.cas.metadata.AbstractMetExtractor;
 import org.apache.oodt.cas.metadata.MetExtractorConfigReader;
+import org.apache.oodt.cas.metadata.SerializableMetadata;
 import org.apache.oodt.cas.metadata.Metadata;
 import org.apache.oodt.commons.exec.EnvUtilities;
 import org.apache.oodt.commons.xml.XMLUtils;
@@ -46,81 +47,82 @@ import org.apache.oodt.commons.xml.XMLUt
  */
 public abstract class CmdLineMetExtractor extends AbstractMetExtractor {
 
-    public CmdLineMetExtractor(MetExtractorConfigReader reader) {
-        super(reader);
-    }
-
-    protected static void processMain(String[] args,
-            CmdLineMetExtractor extractor) throws Exception {
-        String usage = "Usage: " + extractor.getClass().getName()
-                + " <file> <configfile>";
-        String extractFilePath = null, configFilePath = null;
-
-        if (args.length < 2) {
-            System.err.println(usage);
-            System.exit(1);
-        }
-
-        extractFilePath = args[0].replaceAll("\\\\", "");
-        configFilePath = args[1];
-
-        Metadata met = extractor.extractMetadata(new File(extractFilePath),
-                configFilePath);
-        XMLUtils.writeXmlToStream(met.toXML(),
-                getMetFileOutputStream(extractFilePath));
-    }
-
-    protected static void processMain(String[] args,
-            CmdLineMetExtractor extractor, OutputStream os) throws Exception {
-        String usage = "Usage: " + extractor.getClass().getName()
-                + " <file> <configfile>";
-        String extractFilePath = null, configFilePath = null;
-
-        if (args.length < 2) {
-            System.err.println(usage);
-            System.exit(1);
-        }
-
-        extractFilePath = args[0].replaceAll("\\\\", "");
-        configFilePath = args[1];
-
-        Metadata met = extractor.extractMetadata(new File(extractFilePath),
-                configFilePath);
-        XMLUtils.writeXmlToStream(met.toXML(), os);
-
-    }
-
-    private static FileOutputStream getMetFileOutputStream(String filePath) {
-        Properties envVars = EnvUtilities.getEnv();
-        String cwd = envVars.getProperty("PWD");
-        if (cwd == null) {
-            throw new RuntimeException(
-                    "Unable to get current working directory: failing!");
-        }
-
-        if (!cwd.endsWith("/")) {
-            cwd += "/";
-        }
-
-        String metFilePath = cwd
-                + new File(filePath).getName().replaceAll("\\\\", "") + ".met";
-
-        // try and remove the met file, if it already exists
-        // for some reason below, the writeXmlFile method in
-        // XMLUtils doesn't overwrite, and throws an Exception
-        File metFile = new File(metFilePath);
-        if (!metFile.delete()) {
-            LOG.log(Level.WARNING, "Attempt to overwrite met file: ["
-                    + metFilePath + "] unsuccessful!");
-        }
-
-        try {
-            return new FileOutputStream(metFile);
-        } catch (FileNotFoundException e) {
-            LOG.log(Level.WARNING, "Could not create met file: [" + metFile
-                    + "]: Reason " + e.getMessage(), e);
-            return null;
-        }
-    }
+  public CmdLineMetExtractor(MetExtractorConfigReader reader) {
+      super(reader);
+  }
+
+  protected static void processMain(String[] args,
+          CmdLineMetExtractor extractor) throws Exception {
+      String usage = "Usage: " + extractor.getClass().getName()
+              + " <file> <configfile>";
+      String extractFilePath = null, configFilePath = null;
+
+      if (args.length < 2) {
+          System.err.println(usage);
+          System.exit(1);
+      }
+
+      extractFilePath = args[0].replaceAll("\\\\", "");
+      configFilePath = args[1];
+
+      Metadata met = extractor.extractMetadata(new File(extractFilePath),
+              configFilePath);
+      XMLUtils.writeXmlToStream(new SerializableMetadata(met).toXML(),
+              getMetFileOutputStream(extractFilePath));
+  }
+
+  protected static void processMain(String[] args,
+          CmdLineMetExtractor extractor, OutputStream os) throws Exception {
+      String usage = "Usage: " + extractor.getClass().getName()
+              + " <file> <configfile>";
+      String extractFilePath = null, configFilePath = null;
+
+      if (args.length < 2) {
+          System.err.println(usage);
+          System.exit(1);
+      }
+
+      extractFilePath = args[0].replaceAll("\\\\", "");
+      configFilePath = args[1];
+
+      Metadata met = extractor.extractMetadata(new File(extractFilePath),
+              configFilePath);
+      XMLUtils.writeXmlToStream(new SerializableMetadata(met).toXML(), os);
+
+  }
+
+  private static FileOutputStream getMetFileOutputStream(String filePath) {
+      Properties envVars = EnvUtilities.getEnv();
+      String cwd = envVars.getProperty("PWD");
+      if (cwd == null) {
+          throw new RuntimeException(
+                  "Unable to get current working directory: failing!");
+      }
+
+      if (!cwd.endsWith("/")) {
+          cwd += "/";
+      }
+
+      String metFilePath = cwd
+              + new File(filePath).getName().replaceAll("\\\\", "") + ".met";
+
+      // try and remove the met file, if it already exists
+      // for some reason below, the writeXmlFile method in
+      // XMLUtils doesn't overwrite, and throws an Exception
+      File metFile = new File(metFilePath);
+      if (!metFile.delete()) {
+          LOG.log(Level.WARNING, "Attempt to overwrite met file: ["
+                  + metFilePath + "] unsuccessful!");
+      }
+
+      try {
+          return new FileOutputStream(metFile);
+      } catch (FileNotFoundException e) {
+          LOG.log(Level.WARNING, "Could not create met file: [" + metFile
+                  + "]: Reason " + e.getMessage(), e);
+          return null;
+      }
+  }
 
 }
+

Modified: 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CopyAndRewriteExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CopyAndRewriteExtractor.java?rev=964970&r1=964969&r2=964970&view=diff
==============================================================================
--- 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CopyAndRewriteExtractor.java
 (original)
+++ 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/extractors/CopyAndRewriteExtractor.java
 Fri Jul 16 22:55:48 2010
@@ -24,6 +24,7 @@ import java.util.logging.Level;
 
 //OODT imports
 import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.SerializableMetadata;
 import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
 import org.apache.oodt.cas.metadata.extractors.CmdLineMetExtractor;
 import org.apache.oodt.cas.metadata.util.PathUtils;
@@ -52,80 +53,81 @@ import org.apache.oodt.cas.metadata.util
  */
 public class CopyAndRewriteExtractor extends CmdLineMetExtractor {
 
-    private final static String FILENAME = "Filename";
+  private final static String FILENAME = "Filename";
 
-    private final static String FILE_LOCATION = "FileLocation";
+  private final static String FILE_LOCATION = "FileLocation";
 
-    private static CopyAndRewriteConfigReader reader = new 
CopyAndRewriteConfigReader();
+  private static CopyAndRewriteConfigReader reader = new 
CopyAndRewriteConfigReader();
 
-    /**
-     * Default Constructor.
-     * 
-     */
-    public CopyAndRewriteExtractor() {
-        super(reader);
-    }
-
-    /*
-     * (non-Javadoc)
-     * 
-     * @see 
org.apache.oodt.cas.metadata.AbstractMetExtractor#extractMetadata(java.io.File)
-     */
-    public Metadata extrMetadata(File file) throws MetExtractionException {
-        if (this.config == null) {
-            throw new MetExtractionException(
-                    "No config file defined: unable to copy and rewrite 
metadata!");
-        }
-
-        Metadata met = null;
-        
-        try {
-            met = new Metadata(new File(PathUtils
-                    .replaceEnvVariables(((CopyAndRewriteConfig) this.config)
-                            .getProperty("orig.met.file.path"))).toURL()
-                    .openStream());
-        } catch (Exception e) {
-            e.printStackTrace();
-            throw new MetExtractionException(
-                    "error parsing original met file: ["
-                            + ((CopyAndRewriteConfig) this.config)
-                                    .getProperty("orig.met.file.path")
-                            + "]: Message: " + e.getMessage());
-        }
-
-        addDefaultFields(file, met);
-
-        // now override
-        int numOverrideFields = Integer
-                .parseInt(((CopyAndRewriteConfig) this.config)
-                        .getProperty("numRewriteFields"));
-
-        LOG.log(Level.FINE, "Extracting metadata: num rewrite fields: ["
-                + numOverrideFields + "]");
-
-        for (int i = 0; i < numOverrideFields; i++) {
-            String rewriteFieldName = ((CopyAndRewriteConfig) this.config)
-                    .getProperty("rewriteField" + (i + 1));
-            String rewriteFieldStr = ((CopyAndRewriteConfig) this.config)
-                    .getProperty(rewriteFieldName + ".pattern");
-            LOG.log(Level.FINE, "Rewrite string: [" + rewriteFieldStr + "]");
-            rewriteFieldStr = PathUtils.replaceEnvVariables(rewriteFieldStr,
-                    met);
-            met.replaceMetadata(rewriteFieldName, rewriteFieldStr);
-        }
-
-        return met;
-
-    }
-
-    public static void main(String[] args) throws Exception {
-        processMain(args, new CopyAndRewriteExtractor());
-    }
-
-    private void addDefaultFields(File file, Metadata met) {
-        met.replaceMetadata(FILENAME, file.getName());
-        met.replaceMetadata(FILE_LOCATION, file.getParentFile()
-                .getAbsolutePath());
-    }
+  /**
+   * Default Constructor.
+   * 
+   */
+  public CopyAndRewriteExtractor() {
+      super(reader);
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see 
gov.nasa.jpl.oodt.cas.metadata.AbstractMetExtractor#extractMetadata(java.io.File)
+   */
+  public Metadata extrMetadata(File file) throws MetExtractionException {
+      if (this.config == null) {
+          throw new MetExtractionException(
+                  "No config file defined: unable to copy and rewrite 
metadata!");
+      }
+
+      Metadata met = null;
+      
+      try {
+          met = new SerializableMetadata(new File(PathUtils
+                  .replaceEnvVariables(((CopyAndRewriteConfig) this.config)
+                          .getProperty("orig.met.file.path"))).toURL()
+                  .openStream());
+      } catch (Exception e) {
+          e.printStackTrace();
+          throw new MetExtractionException(
+                  "error parsing original met file: ["
+                          + ((CopyAndRewriteConfig) this.config)
+                                  .getProperty("orig.met.file.path")
+                          + "]: Message: " + e.getMessage());
+      }
+
+      addDefaultFields(file, met);
+
+      // now override
+      int numOverrideFields = Integer
+              .parseInt(((CopyAndRewriteConfig) this.config)
+                      .getProperty("numRewriteFields"));
+
+      LOG.log(Level.FINE, "Extracting metadata: num rewrite fields: ["
+              + numOverrideFields + "]");
+
+      for (int i = 0; i < numOverrideFields; i++) {
+          String rewriteFieldName = ((CopyAndRewriteConfig) this.config)
+                  .getProperty("rewriteField" + (i + 1));
+          String rewriteFieldStr = ((CopyAndRewriteConfig) this.config)
+                  .getProperty(rewriteFieldName + ".pattern");
+          LOG.log(Level.FINE, "Rewrite string: [" + rewriteFieldStr + "]");
+          rewriteFieldStr = PathUtils.replaceEnvVariables(rewriteFieldStr,
+                  met);
+          met.replaceMetadata(rewriteFieldName, rewriteFieldStr);
+      }
+
+      return met;
+
+  }
+
+  public static void main(String[] args) throws Exception {
+      processMain(args, new CopyAndRewriteExtractor());
+  }
+
+  private void addDefaultFields(File file, Metadata met) {
+      met.replaceMetadata(FILENAME, file.getName());
+      met.replaceMetadata(FILE_LOCATION, file.getParentFile()
+              .getAbsolutePath());
+  }
 
 }
+

Modified: 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/util/MimeTypeUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/util/MimeTypeUtils.java?rev=964970&r1=964969&r2=964970&view=diff
==============================================================================
--- 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/util/MimeTypeUtils.java
 (original)
+++ 
incubator/oodt/trunk/metadata/src/main/java/org/apache/oodt/cas/metadata/util/MimeTypeUtils.java
 Fri Jul 16 22:55:48 2010
@@ -166,7 +166,7 @@ public final class MimeTypeUtils {
 
         // if returned null, or if it's the default type then try url 
resolution
         if (type == null
-                || (type != null && type.getName().equals(MimeTypes.DEFAULT))) 
{
+                || (type != null && 
type.getName().equals(MimeTypes.OCTET_STREAM))) {
             // If no mime-type header, or cannot find a corresponding 
registered
             // mime-type, then guess a mime-type from the url pattern
             type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
@@ -182,7 +182,7 @@ public final class MimeTypeUtils {
         if (this.mimeMagic) {
             MimeType magicType = this.mimeTypes.getMimeType(data);
             if (magicType != null
-                    && !magicType.getName().equals(MimeTypes.DEFAULT)
+                    && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
                     && type != null
                     && !type.getName().equals(magicType.getName())) {
                 // If magic enabled and the current mime type differs from that
@@ -196,7 +196,7 @@ public final class MimeTypeUtils {
             // default type
             if (type == null) {
                 try {
-                    type = this.mimeTypes.forName(MimeTypes.DEFAULT);
+                    type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
                 } catch (Exception ignore) {
                 }
             }

Modified: 
incubator/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/preconditions/TestPreCondEvalUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/preconditions/TestPreCondEvalUtils.java?rev=964970&r1=964969&r2=964970&view=diff
==============================================================================
--- 
incubator/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/preconditions/TestPreCondEvalUtils.java
 (original)
+++ 
incubator/oodt/trunk/metadata/src/test/org/apache/oodt/cas/metadata/preconditions/TestPreCondEvalUtils.java
 Fri Jul 16 22:55:48 2010
@@ -20,9 +20,11 @@ package org.apache.oodt.cas.metadata.pre
 
 //JDK imports
 import java.io.File;
-import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.util.LinkedList;
 
+//Spring imports
+import org.springframework.beans.BeansException;
 import org.springframework.context.support.FileSystemXmlApplicationContext;
 
 //Junit imports
@@ -44,21 +46,22 @@ public class TestPreCondEvalUtils extend
     
     private PreCondEvalUtils evalUtils;
     
-    private static final String MET_EXTR_TEST_FILE = 
"./src/main/resources/examples/met_extr_preconditions.xml";
 
-    public TestPreCondEvalUtils() throws FileNotFoundException,
-            ClassNotFoundException, InstantiationException,
-            IllegalAccessException {
+    public TestPreCondEvalUtils() throws ClassNotFoundException, 
InstantiationException,
+            IllegalAccessException, BeansException, IOException {
         this.preconditions = new LinkedList<String>();
         this.preconditions.add("CheckThatDataFileSizeIsGreaterThanZero");
         this.preconditions.add("CheckThatDataFileExists");
         this.preconditions.add("CheckDataFileMimeType");
-        this.evalUtils = new PreCondEvalUtils(new 
FileSystemXmlApplicationContext(MET_EXTR_TEST_FILE));
+        File preCondFile = new 
File(getClass().getResource("met_extr_preconditions.xml").getFile());
+        assertNotNull(preCondFile);
+        this.evalUtils = new PreCondEvalUtils(new 
FileSystemXmlApplicationContext(preCondFile.toURL().toExternalForm()));
     }
 
     public void testEval(){
-        File prodFile = new File(MET_EXTR_TEST_FILE);
+        File prodFile = null;
         try{
+            prodFile = new 
File(getClass().getResource("met_extr_preconditions.xml").getFile());
             assertTrue(this.evalUtils.eval(this.preconditions, prodFile));
         }
         catch(Throwable e){

Added: incubator/oodt/trunk/metadata/src/testdata/met_extr_preconditions.xml
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/src/testdata/met_extr_preconditions.xml?rev=964970&view=auto
==============================================================================
--- incubator/oodt/trunk/metadata/src/testdata/met_extr_preconditions.xml 
(added)
+++ incubator/oodt/trunk/metadata/src/testdata/met_extr_preconditions.xml Fri 
Jul 16 22:55:48 2010
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- 
+       Copyright (c) 2008, California Institute of Technology.
+       ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged.
+       
+       $Id$
+       
+       Author: bfoster, mattmann
+       Description: Describes pre-conditions that should be evaluated before
+       running a particular MetExtractor.
+-->
+
+<beans xmlns="http://www.springframework.org/schema/beans"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"    
+    xsi:schemaLocation="http://www.springframework.org/schema/beans 
http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
+    
+
+    <!-- Precondition Comparators -->
+    <bean id="PreconditionComparator" lazy-init="true" abstract="true" 
class="org.apache.oodt.cas.metadata.preconditions.PreConditionComparator"/>
+    
+    
+    <bean id="CheckThatDataFileSizeIsGreaterThanZero" lazy-init="true" 
parent="PreconditionComparator" 
class="org.apache.oodt.cas.metadata.preconditions.FileSizeComparator">
+        <property name="description" value="Check if the current data file 
size is greater than zero"/>        
+        <property name="compareItem">
+            <value type="java.lang.Long">0</value>
+        </property>
+        <property name="type" value="greater_than"/>
+    </bean>
+    
+    <bean id="CheckThatDataFileExists" lazy-init="true" 
parent="PreconditionComparator" 
class="org.apache.oodt.cas.metadata.preconditions.ExistanceCheckComparator">
+        <property name="description" value="Check if the current data file 
exists"/>        
+        <property name="compareItem">
+            <value type="java.lang.Boolean">true</value>
+        </property>
+        <property name="type" value="equal_to"/>    
+    </bean>
+
+    <bean id="CheckDataFileMimeType" lazy-init="true" 
parent="PreconditionComparator" 
class="org.apache.oodt.cas.metadata.preconditions.MimeTypeComparator">
+        <property name="description" value="Check that data file mime type 
matches the specified mime type"/>        
+        <property name="compareItem">
+            <value type="java.lang.String">application/xml</value>
+        </property>
+        <property name="type" value="equal_to"/>   
+        <property name="mimeTypeRepo" 
value="./src/testdata/tika-mimetypes.xml"/>
+        <property name="useMagic" value="true"/> 
+    </bean>
+
+</beans>
\ No newline at end of file

Added: incubator/oodt/trunk/metadata/src/testdata/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/incubator/oodt/trunk/metadata/src/testdata/tika-mimetypes.xml?rev=964970&view=auto
==============================================================================
--- incubator/oodt/trunk/metadata/src/testdata/tika-mimetypes.xml (added)
+++ incubator/oodt/trunk/metadata/src/testdata/tika-mimetypes.xml Fri Jul 16 
22:55:48 2010
@@ -0,0 +1,654 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- 
+  Copyright (c) 2008, California Institute of Technology.
+  ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged.
+  
+  $Id$
+ -->
+<!--
+       Licensed to the Apache Software Foundation (ASF) under one or more
+       contributor license agreements.  See the NOTICE file distributed with
+       this work for additional information regarding copyright ownership.
+       The ASF licenses this file to You under the Apache License, Version 2.0
+       (the "License"); you may not use this file except in compliance with
+       the License.  You may obtain a copy of the License at
+       
+       http://www.apache.org/licenses/LICENSE-2.0
+       
+       Unless required by applicable law or agreed to in writing, software
+       distributed under the License is distributed on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+       See the License for the specific language governing permissions and
+       limitations under the License.
+       
+       Description: This xml file defines the valid mime types used by Tika.
+       The mime types within this file are based on the types in the 
mime-types.xml 
+       file available in Apache Nutch.
+-->
+
+<mime-info>
+
+       <mime-type type="text/plain">
+               <magic priority="50">
+                       <match value="This is TeX," type="string" offset="0" />
+                       <match value="This is METAFONT," type="string" 
offset="0" />
+               </magic>
+               <glob pattern="*.txt" />
+               <glob pattern="*.asc" />
+
+               <!-- TIKA-85: http://www.apache.org/dev/svn-eol-style.txt -->
+               <glob pattern="INSTALL" />
+               <glob pattern="KEYS" />
+               <glob pattern="Makefile" />
+               <glob pattern="README" />
+               <glob pattern="abs-linkmap" />
+               <glob pattern="abs-menulinks" />
+               <glob pattern="*.aart" />
+               <glob pattern="*.ac" />
+               <glob pattern="*.am" />
+               <glob pattern="*.bat" />
+               <glob pattern="*.c" />
+               <glob pattern="*.cat" />
+               <glob pattern="*.cgi" />
+               <glob pattern="*.classpath" />
+               <glob pattern="*.cmd" />
+               <glob pattern="*.conf" />
+               <glob pattern="*.config" />
+               <glob pattern="*.cpp" />
+               <glob pattern="*.css" />
+               <glob pattern="*.cwiki" />
+               <glob pattern="*.data" />
+               <glob pattern="*.dcl" />
+               <glob pattern="*.dtd" />
+               <glob pattern="*.egrm" />
+               <glob pattern="*.ent" />
+               <glob pattern="*.ft" />
+               <glob pattern="*.fn" />
+               <glob pattern="*.fv" />
+               <glob pattern="*.grm" />
+               <glob pattern="*.g" />
+               <glob pattern="*.h" />
+               <glob pattern=".htaccess" />
+               <glob pattern="*.ihtml" />
+               <glob pattern="*.in" />
+               <glob pattern="*.java" />
+               <glob pattern="*.jmx" />
+               <glob pattern="*.jsp" />
+               <glob pattern="*.js" />
+               <glob pattern="*.junit" />
+               <glob pattern="*.jx" />
+               <glob pattern="*.manifest" />
+               <glob pattern="*.m4" />
+               <glob pattern="*.mf" />
+               <glob pattern="*.MF" />
+               <glob pattern="*.meta" />
+               <glob pattern="*.mod" />
+               <glob pattern="*.n3" />
+               <glob pattern="*.pen" />
+               <glob pattern="*.pl" />
+               <glob pattern="*.pm" />
+               <glob pattern="*.pod" />
+               <glob pattern="*.pom" />
+               <glob pattern="*.project" />
+               <glob pattern="*.properties" />
+               <glob pattern="*.py" />
+               <glob pattern="*.rb" />
+               <glob pattern="*.rdf" />
+               <glob pattern="*.rnc" />
+               <glob pattern="*.rng" />
+               <glob pattern="*.rnx" />
+               <glob pattern="*.roles" />
+               <glob pattern="*.sh" />
+               <glob pattern="*.sql" />
+               <glob pattern="*.svg" />
+               <glob pattern="*.tld" />
+               <glob pattern="*.types" />
+               <glob pattern="*.vm" />
+               <glob pattern="*.vsl" />
+               <glob pattern="*.wsdd" />
+               <glob pattern="*.wsdl" />
+               <glob pattern="*.xargs" />
+               <glob pattern="*.xcat" />
+               <glob pattern="*.xconf" />
+               <glob pattern="*.xegrm" />
+               <glob pattern="*.xgrm" />
+               <glob pattern="*.xlex" />
+               <glob pattern="*.xlog" />
+               <glob pattern="*.xmap" />
+               <glob pattern="*.xroles" />
+               <glob pattern="*.xsamples" />
+               <glob pattern="*.xsd" />
+               <glob pattern="*.xsl" />
+               <glob pattern="*.xslt" />
+               <glob pattern="*.xsp" />
+               <glob pattern="*.xul" />
+               <glob pattern="*.xweb" />
+               <glob pattern="*.xwelcome" />
+       </mime-type>
+
+       <mime-type type="text/html">
+               <magic priority="50">
+                       <match value="&lt;!DOCTYPE HTML" type="string"
+                               offset="0:64" />
+                       <match value="&lt;!doctype html" type="string"
+                               offset="0:64" />
+                       <match value="&lt;HEAD" type="string" offset="0:64" />
+                       <match value="&lt;head" type="string" offset="0:64" />
+                       <match value="&lt;TITLE" type="string" offset="0:64" />
+                       <match value="&lt;title" type="string" offset="0:64" />
+                       <match value="&lt;html" type="string" offset="0:64" />
+                       <match value="&lt;HTML" type="string" offset="0:64" />
+                       <match value="&lt;BODY" type="string" offset="0" />
+                       <match value="&lt;body" type="string" offset="0" />
+                       <match value="&lt;TITLE" type="string" offset="0" />
+                       <match value="&lt;title" type="string" offset="0" />
+                       <match value="&lt;!--" type="string" offset="0" />
+                       <match value="&lt;h1" type="string" offset="0" />
+                       <match value="&lt;H1" type="string" offset="0" />
+                       <match value="&lt;!doctype HTML" type="string" 
offset="0" />
+                       <match value="&lt;!DOCTYPE html" type="string" 
offset="0" />
+               </magic>
+               <glob pattern="*.html" />
+               <glob pattern="*.htm" />
+       </mime-type>
+
+       <mime-type type="application/xhtml+xml">
+               <sub-class-of type="application/xml" />
+               <glob pattern="*.xhtml" />
+               <root-XML namespaceURI='http://www.w3.org/1999/xhtml'
+                       localName='html' />
+       </mime-type>
+
+       <mime-type type="application/vnd.ms-powerpoint">
+               <glob pattern="*.ppz" />
+               <glob pattern="*.ppt" />
+               <glob pattern="*.pps" />
+               <glob pattern="*.pot" />
+               <magic priority="50">
+                       <match value="0xcfd0e011" type="little32" offset="0" />
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.ms-excel">
+               <magic priority="50">
+                       <match value="Microsoft Excel 5.0 Worksheet" 
type="string"
+                               offset="2080" />
+               </magic>
+               <glob pattern="*.xls" />
+               <glob pattern="*.xlc" />
+               <glob pattern="*.xll" />
+               <glob pattern="*.xlm" />
+               <glob pattern="*.xlw" />
+               <glob pattern="*.xla" />
+               <glob pattern="*.xlt" />
+               <glob pattern="*.xld" />
+               <alias type="application/msexcel" />
+       </mime-type>
+
+<!-- ===================================================================== -->
+<!-- Open Document Format for Office Applications (OpenDocument) v1.0      -->
+<!-- http://www.oasis-open.org/specs/index.php#opendocumentv1.0            -->
+<!-- ===================================================================== -->
+
+       <mime-type type="application/vnd.oasis.opendocument.text">
+               <comment>OpenDocument v1.0: Text document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.text" />
+               <glob pattern="*.odt" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       
value="mimetypeapplication/vnd.oasis.opendocument.text" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.text-template">
+               <comment>OpenDocument v1.0: Text document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.text-template" />
+               <glob pattern="*.ott" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.text-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.graphics">
+               <comment>OpenDocument v1.0: Graphics document (Drawing)</comment>
+               <alias type="application/x-vnd.oasis.opendocument.graphics" />
+               <glob pattern="*.odg" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.graphics" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.graphics-template">
+               <comment>OpenDocument v1.0: Graphics document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.graphics-template" />
+               <glob pattern="*.otg" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.graphics-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.presentation">
+               <comment>OpenDocument v1.0: Presentation document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.presentation" />
+               <glob pattern="*.odp" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.presentation" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.presentation-template">
+               <comment>OpenDocument v1.0: Presentation document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.presentation-template" />
+               <glob pattern="*.otp" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.presentation-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.spreadsheet">
+               <comment>OpenDocument v1.0: Spreadsheet document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.spreadsheet" />
+               <glob pattern="*.ods" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.spreadsheet" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.spreadsheet-template">
+               <comment>OpenDocument v1.0: Spreadsheet document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.spreadsheet-template" />
+               <glob pattern="*.ots" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.spreadsheet-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.chart">
+               <comment>OpenDocument v1.0: Chart document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.chart" />
+               <glob pattern="*.odc" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.chart" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.chart-template">
+               <comment>OpenDocument v1.0: Chart document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.chart-template" />
+               <glob pattern="*.otc" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.chart-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.image">
+               <comment>OpenDocument v1.0: Image document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.image" />
+               <glob pattern="*.odi" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.image" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.image-template">
+               <comment>OpenDocument v1.0: Image document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.image-template" />
+               <glob pattern="*.oti" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.image-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.formula">
+               <comment>OpenDocument v1.0: Formula document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.formula" />
+               <glob pattern="*.odf" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.formula" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.formula-template">
+               <comment>OpenDocument v1.0: Formula document used as template</comment>
+               <alias type="application/x-vnd.oasis.opendocument.formula-template" />
+               <glob pattern="*.otf" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.formula-template" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.text-master">
+               <comment>OpenDocument v1.0: Global Text document</comment>
+               <alias type="application/x-vnd.oasis.opendocument.text-master" />
+               <glob pattern="*.odm" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.text-master" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/vnd.oasis.opendocument.text-web">
+               <comment>OpenDocument v1.0: Text document used as template for HTML documents</comment>
+               <alias type="application/x-vnd.oasis.opendocument.text-web" />
+               <glob pattern="*.oth" />
+               <magic>
+                       <match type="string" offset="0" value="PK">
+                               <match type="string" offset="30"
+                                       value="mimetypeapplication/vnd.oasis.opendocument.text-web" />
+                       </match>
+               </magic>
+       </mime-type>
+
+       <mime-type type="application/zip">
+               <alias type="application/x-zip-compressed" />
+               <magic priority="40">
+                       <match value="PK\003\004" type="string" offset="0" />
+               </magic>
+               <glob pattern="*.zip" />
+       </mime-type>
+
+       <mime-type type="application/msword">
+               <magic priority="50">
+                       <match value="\x31\xbe\x00\x00" type="string" offset="0" />
+                       <match value="PO^Q`" type="string" offset="0" />
+                       <match value="\376\067\0\043" type="string" offset="0" />
+                       <match value="\333\245-\0\0\0" type="string" offset="0" />
+                       <match value="Microsoft Word 6.0 Document" type="string"
+                               offset="2080" />
+                       <match value="Microsoft Word document data" type="string"
+                               offset="2112" />
+               </magic>
+               <glob pattern="*.doc" />
+               <alias type="application/vnd.ms-word" />
+       </mime-type>
+
+       <mime-type type="application/octet-stream">
+               <magic priority="50">
+                       <match value="\037\036" type="string" offset="0" />
+                       <match value="017437" type="host16" offset="0" />
+                       <match value="0x1fff" type="host16" offset="0" />
+                       <match value="\377\037" type="string" offset="0" />
+                       <match value="0145405" type="host16" offset="0" />
+               </magic>
+               <glob pattern="*.bin" />
+       </mime-type>
+
+       <mime-type type="application/pdf">
+               <magic priority="50">
+                       <match value="%PDF-" type="string" offset="0" />
+               </magic>
+               <glob pattern="*.pdf" />
+               <alias type="application/x-pdf" />
+       </mime-type>
+
+       <mime-type type="application/atom+xml">
+               <root-XML localName="feed"
+                       namespaceURI="http://purl.org/atom/ns#" />
+       </mime-type>
+
+       <mime-type type="application/mac-binhex40">
+               <glob pattern="*.hqx" />
+       </mime-type>
+
+       <mime-type type="application/mac-compactpro">
+               <glob pattern="*.cpt" />
+       </mime-type>
+
+       <mime-type type="application/rtf">
+           <glob pattern="*.rtf"/>
+               <alias type="text/rtf" />
+       </mime-type>
+
+       <mime-type type="application/rss+xml">
+               <alias type="text/rss" />
+               <root-XML localName="rss" />
+               <root-XML namespaceURI="http://purl.org/rss/1.0/" />
+               <glob pattern="*.rss" />
+       </mime-type>
+
+       <!--  added in by mattmann -->
+       <mime-type type="application/xml">
+               <alias type="text/xml" />
+               <glob pattern="*.xml" />
+       </mime-type>
+
+       <mime-type type="application/x-mif">
+               <alias type="application/vnd.mif" />
+       </mime-type>
+
+       <mime-type type="application/vnd.wap.wbxml">
+               <glob pattern="*.wbxml" />
+       </mime-type>
+
+       <mime-type type="application/vnd.wap.wmlc">
+               <_comment>Compiled WML Document</_comment>
+               <glob pattern="*.wmlc" />
+       </mime-type>
+
+       <mime-type type="application/vnd.wap.wmlscriptc">
+               <_comment>Compiled WML Script</_comment>
+               <glob pattern="*.wmlsc" />
+       </mime-type>
+
+       <mime-type type="text/vnd.wap.wmlscript">
+               <_comment>WML Script</_comment>
+               <glob pattern="*.wmls" />
+       </mime-type>
+
+       <mime-type type="application/x-bzip">
+               <alias type="application/x-bzip2" />
+       </mime-type>
+
+       <mime-type type="application/x-bzip-compressed-tar">
+               <glob pattern="*.tbz" />
+               <glob pattern="*.tbz2" />
+       </mime-type>
+
+       <mime-type type="application/x-cdlink">
+               <_comment>Virtual CD-ROM CD Image File</_comment>
+               <glob pattern="*.vcd" />
+       </mime-type>
+
+       <mime-type type="application/x-director">
+               <_comment>Shockwave Movie</_comment>
+               <glob pattern="*.dcr" />
+               <glob pattern="*.dir" />
+               <glob pattern="*.dxr" />
+       </mime-type>
+
+       <mime-type type="application/x-futuresplash">
+               <_comment>Macromedia FutureSplash File</_comment>
+               <glob pattern="*.spl" />
+       </mime-type>
+
+       <mime-type type="application/x-java">
+               <alias type="application/java" />
+       </mime-type>
+
+       <mime-type type="application/x-koan">
+               <_comment>SSEYO Koan File</_comment>
+               <glob pattern="*.skp" />
+               <glob pattern="*.skd" />
+               <glob pattern="*.skt" />
+               <glob pattern="*.skm" />
+       </mime-type>
+
+       <mime-type type="application/x-latex">
+               <_comment>LaTeX Source Document</_comment>
+               <glob pattern="*.latex" />
+       </mime-type>
+
+       <!-- JC CHANGED
+               <mime-type type="application/x-mif">
+               <_comment>FrameMaker MIF document</_comment>
+               <glob pattern="*.mif"/>
+               </mime-type> -->
+
+       <mime-type type="application/x-ms-dos-executable">
+               <alias type="application/x-dosexec" />
+       </mime-type>
+
+       <mime-type type="application/ogg">
+               <alias type="application/x-ogg" />
+       </mime-type>
+
+       <mime-type type="application/x-rar">
+               <alias type="application/x-rar-compressed" />
+       </mime-type>
+
+       <mime-type type="application/x-shellscript">
+               <alias type="application/x-sh" />
+       </mime-type>
+
+       <mime-type type="application/xhtml+xml">
+               <glob pattern="*.xht" />
+       </mime-type>
+
+       <mime-type type="audio/midi">
+               <glob pattern="*.kar" />
+       </mime-type>
+
+       <mime-type type="audio/x-pn-realaudio">
+               <alias type="audio/x-realaudio" />
+       </mime-type>
+
+       <mime-type type="image/tiff">
+               <magic priority="50">
+                       <match value="0x4d4d2a00" type="string" offset="0" />
+                       <match value="0x49492a00" type="string" offset="0" />
+               </magic>
+       </mime-type>
+
+       <mime-type type="message/rfc822">
+               <magic priority="50">
+                       <match type="string" value="Relay-Version:" offset="0" />
+                       <match type="string" value="#! rnews" offset="0" />
+                       <match type="string" value="N#! rnews" offset="0" />
+                       <match type="string" value="Forward to" offset="0" />
+                       <match type="string" value="Pipe to" offset="0" />
+                       <match type="string" value="Return-Path:" offset="0" />
+                       <match type="string" value="From:" offset="0" />
+                       <match type="string" value="Message-ID:" offset="0" />
+                       <match type="string" value="Date:" offset="0" />
+               </magic>
+       </mime-type>
+
+       <mime-type type="image/vnd.wap.wbmp">
+               <_comment>Wireless Bitmap File Format</_comment>
+               <glob pattern="*.wbmp" />
+       </mime-type>
+
+       <mime-type type="image/x-psd">
+               <alias type="image/photoshop" />
+       </mime-type>
+
+       <mime-type type="image/x-xcf">
+               <alias type="image/xcf" />
+               <magic priority="50">
+                       <match type="string" value="gimp xcf " offset="0" />
+               </magic>
+       </mime-type>
+
+       <mime-type type="model/iges">
+               <_comment>
+                       Initial Graphics Exchange Specification Format
+               </_comment>
+               <glob pattern="*.igs" />
+               <glob pattern="*.iges" />
+       </mime-type>
+
+       <mime-type type="model/mesh">
+               <glob pattern="*.msh" />
+               <glob pattern="*.mesh" />
+               <glob pattern="*.silo" />
+       </mime-type>
+
+       <mime-type type="model/vrml">
+               <glob pattern="*.vrml" />
+       </mime-type>
+
+       <mime-type type="text/x-tcl">
+               <alias type="application/x-tcl" />
+       </mime-type>
+
+       <mime-type type="text/x-tex">
+               <alias type="application/x-tex" />
+       </mime-type>
+
+       <mime-type type="text/x-texinfo">
+               <alias type="application/x-texinfo" />
+       </mime-type>
+
+       <mime-type type="text/x-troff-me">
+               <alias type="application/x-troff-me" />
+       </mime-type>
+
+       <mime-type type="video/vnd.mpegurl">
+               <glob pattern="*.mxu" />
+       </mime-type>
+
+       <mime-type type="x-conference/x-cooltalk">
+               <_comment>Cooltalk Audio</_comment>
+               <glob pattern="*.ice" />
+       </mime-type>
+
+<!-- ===================================================================== -->
+<!-- TIKA-85: http://www.apache.org/dev/svn-eol-style.txt                  -->
+<!-- ===================================================================== -->
+
+       <mime-type type="image/x-icon">
+               <glob pattern="*.ico" />
+       </mime-type>
+
+       <mime-type type="image/jpeg">
+               <glob pattern="*.jpg" />
+       </mime-type>
+
+       <mime-type type="image/png">
+               <glob pattern="*.png" />
+       </mime-type>
+
+</mime-info>


Reply via email to