Author: mattmann
Date: Mon Mar  9 05:32:46 2009
New Revision: 751586

URL: http://svn.apache.org/viewvc?rev=751586&view=rev
Log:
- fix for TIKA-194

Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
    lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
    lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Mon Mar  9 05:32:46 2009
@@ -6,6 +6,14 @@
 
 The most notable changes in Tika 0.3 over the previous release are:
 
+  * Tika now supports mime type glob patterns specified using
+    standard JDK 1.4 (and beyond) syntax via the isregex attribute
+    on the glob tag. See:
+    
+    http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
+    
+    for more information. (TIKA-194)
+
   * Tika now supports the Office Open XML format used by
     Microsoft Office 2007. (TIKA-152)
 

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Mon Mar  9 05:32:46 2009
@@ -387,15 +387,42 @@
     }
 
     /**
-     * Adds a file name pattern for the given media type.
-     *
-     * @param type media type
-     * @param pattern file name pattern
-     * @throws MimeTypeException if the pattern conflicts with existing ones
+     * Adds a file name pattern for the given media type. Assumes that the
+     * pattern being added is <b>not</b> a JDK standard regular expression.
+     * 
+     * @param type
+     *            media type
+     * @param pattern
+     *            file name pattern
+     * @throws MimeTypeException
+     *             if the pattern conflicts with existing ones
      */
     public void addPattern(MimeType type, String pattern)
             throws MimeTypeException {
-        patterns.add(pattern, type);
+        this.addPattern(type, pattern, false);
+    }
+
+    /**
+     * Adds a file name pattern for the given media type. The caller can specify
+     * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
+     * regular expression via the <code>isRegex</code> parameter. If the value
+     * is set to true, then a JDK standard regex is assumed, otherwise the
+     * freedesktop glob type is assumed.
+     * 
+     * @param type
+     *            media type
+     * @param pattern
+     *            file name pattern
+     * @param isRegex
+     *            set to true if JDK std regexs are desired, otherwise set to
+     *            false.
+     * @throws MimeTypeException
+     *             if the pattern conflicts with existing ones.
+     * 
+     */
+    public void addPattern(MimeType type, String pattern, boolean isRegex)
+            throws MimeTypeException {
+        patterns.add(pattern, isRegex, type);
     }
 
     /**

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Mon Mar  9 05:32:46 2009
@@ -59,6 +59,7 @@
  * 
  *    &lt;!ELEMENT glob EMPTY&gt;
  *    &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
+ *    &lt;!ATTLIST glob isregex CDATA #IMPLIED&gt;
  * 
  *    &lt;!ELEMENT magic (match)+&gt;
  *    &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
@@ -159,7 +160,8 @@
                         type.setDescription(
                                 nodeElement.getFirstChild().getNodeValue());
                     } else if (nodeElement.getTagName().equals("glob")) {
-                        types.addPattern(type, 
nodeElement.getAttribute("pattern"));
+                        boolean useRegex = 
Boolean.valueOf(nodeElement.getAttribute("isregex"));
+                        types.addPattern(type, 
nodeElement.getAttribute("pattern"), useRegex);
                     } else if (nodeElement.getTagName().equals("magic")) {
                         readMagic(nodeElement, type);
                     } else if (nodeElement.getTagName().equals("alias")) {

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Mon Mar  9 05:32:46 2009
@@ -57,25 +57,36 @@
             }
         });
 
+
     public void add(String pattern, MimeType type) throws MimeTypeException {
+        this.add(pattern, false, type);
+    }
+   
+    public void add(String pattern, boolean isJavaRegex, MimeType type)
+            throws MimeTypeException {
         if (pattern == null || type == null) {
-            throw new IllegalArgumentException("Pattern and/or mime type is 
missing");
+            throw new IllegalArgumentException(
+                    "Pattern and/or mime type is missing");
         }
-
-        if (pattern.indexOf('*') == -1
-                && pattern.indexOf('?') == -1
-                && pattern.indexOf('[') == -1) {
-            addName(pattern, type);
-        } else if (pattern.startsWith("*")
-                && pattern.indexOf('*', 1) == -1
-                && pattern.indexOf('?') == -1
-                && pattern.indexOf('[') == -1) {
-            addExtension(pattern.substring(1), type);
+        
+        if (isJavaRegex) {
+            // in this case, we don't need to build a regex pattern
+            // it's already there for us, so just add the pattern as is
+            addGlob(pattern, type);
         } else {
-            addGlob(compile(pattern), type);
+
+            if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1
+                    && pattern.indexOf('[') == -1) {
+                addName(pattern, type);
+            } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1
+                    && pattern.indexOf('?') == -1 && pattern.indexOf('[') == 
-1) {
+                addExtension(pattern.substring(1), type);
+            } else {
+                addGlob(compile(pattern), type);
+            }
         }
     }
-
+    
     private void addName(String name, MimeType type) throws MimeTypeException {
         MimeType previous = names.get(name);
         if (previous == null || previous.isDescendantOf(type)) {
@@ -107,7 +118,7 @@
             throws MimeTypeException {
         MimeType previous = globs.get(glob);
         if (previous == null || previous.isDescendantOf(type)) {
-            extensions.put(glob, type);
+            globs.put(glob, type);
         } else if (previous == type || type.isDescendantOf(previous)) {
             // do nothing
         } else {

Modified: lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Mon Mar  9 05:32:46 2009
@@ -208,6 +208,28 @@
         assertTypeByName("application/postscript", "x.epsi");
     }
 
+    /**
+     * @since TIKA-194
+     */
+    public void testJavaRegex() throws Exception{
+        MimeType testType = new MimeType(this.repo, "foo/bar");
+        this.repo.add(testType);
+        assertNotNull(repo.forName("foo/bar"));
+        String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
+        System.out.println("Pattern: ["+pattern+"]");
+        this.repo.addPattern(testType, pattern, true);
+        String testFileName = "rtg_sst_grb_0.5.12345678";
+        assertNotNull(this.repo.getMimeType(testFileName));
+        assertEquals(this.repo.getMimeType(testFileName).getName(), 
"foo/bar");    
+        
+        MimeType testType2 = new MimeType(this.repo, "foo/bar2");
+        this.repo.add(testType2);
+        assertNotNull(repo.forName("foo/bar2"));
+        this.repo.addPattern(testType2, pattern, false);
+        assertNotNull(this.repo.getMimeType(testFileName));
+        assertNotSame("foo/bar2", 
this.repo.getMimeType(testFileName).getName());
+    }
+    
     public void testRawDetection() throws Exception {
         assertTypeByName("image/x-tika-dng", "x.dng");
         assertTypeByName("image/x-tika-dng", "x.DNG");


Reply via email to