Author: mattmann
Date: Mon Mar 9 05:32:46 2009
New Revision: 751586
URL: http://svn.apache.org/viewvc?rev=751586&view=rev
Log:
- fix for TIKA-194
Modified:
lucene/tika/trunk/CHANGES.txt
lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: lucene/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Mon Mar 9 05:32:46 2009
@@ -6,6 +6,14 @@
The most notable changes in Tika 0.3 over the previous release are:
+ * Tika now supports mime type glob patterns specified using
+ standard JDK 1.4 (and beyond) regular expression syntax via the
+ isregex attribute on the glob tag. See:
+
+ http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
+
+ for more information. (TIKA-194)
+
* Tika now supports the Office Open XML format used by
Microsoft Office 2007. (TIKA-152)
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Mon Mar 9 05:32:46 2009
@@ -387,15 +387,42 @@
}
/**
- * Adds a file name pattern for the given media type.
- *
- * @param type media type
- * @param pattern file name pattern
- * @throws MimeTypeException if the pattern conflicts with existing ones
+ * Adds a file name pattern for the given media type. Assumes that the
+ * pattern being added is <b>not</b> a JDK standard regular expression.
+ *
+ * @param type
+ * media type
+ * @param pattern
+ * file name pattern
+ * @throws MimeTypeException
+ * if the pattern conflicts with existing ones
*/
public void addPattern(MimeType type, String pattern)
throws MimeTypeException {
- patterns.add(pattern, type);
+ this.addPattern(type, pattern, false);
+ }
+
+ /**
+ * Adds a file name pattern for the given media type. The caller can specify
+ * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
+ * regular expression via the <code>isRegex</code> parameter. If the value
+ * is set to true, then a JDK standard regex is assumed, otherwise the
+ * freedesktop glob type is assumed.
+ *
+ * @param type
+ * media type
+ * @param pattern
+ * file name pattern
+ * @param isRegex
+ * set to true if JDK std regexes are desired, otherwise set to
+ * false.
+ * @throws MimeTypeException
+ * if the pattern conflicts with existing ones.
+ *
+ */
+ public void addPattern(MimeType type, String pattern, boolean isRegex)
+ throws MimeTypeException {
+ patterns.add(pattern, isRegex, type);
}
/**
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Mon Mar 9 05:32:46 2009
@@ -59,6 +59,7 @@
*
* <!ELEMENT glob EMPTY>
* <!ATTLIST glob pattern CDATA #REQUIRED>
+ * <!ATTLIST glob isregex CDATA #IMPLIED>
*
* <!ELEMENT magic (match)+>
* <!ATTLIST magic priority CDATA #IMPLIED>
@@ -159,7 +160,8 @@
type.setDescription(
nodeElement.getFirstChild().getNodeValue());
} else if (nodeElement.getTagName().equals("glob")) {
- types.addPattern(type,
nodeElement.getAttribute("pattern"));
+ boolean useRegex =
Boolean.valueOf(nodeElement.getAttribute("isregex"));
+ types.addPattern(type,
nodeElement.getAttribute("pattern"), useRegex);
} else if (nodeElement.getTagName().equals("magic")) {
readMagic(nodeElement, type);
} else if (nodeElement.getTagName().equals("alias")) {
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Mon Mar 9 05:32:46 2009
@@ -57,25 +57,36 @@
}
});
+
public void add(String pattern, MimeType type) throws MimeTypeException {
+ this.add(pattern, false, type);
+ }
+
+ public void add(String pattern, boolean isJavaRegex, MimeType type)
+ throws MimeTypeException {
if (pattern == null || type == null) {
- throw new IllegalArgumentException("Pattern and/or mime type is
missing");
+ throw new IllegalArgumentException(
+ "Pattern and/or mime type is missing");
}
-
- if (pattern.indexOf('*') == -1
- && pattern.indexOf('?') == -1
- && pattern.indexOf('[') == -1) {
- addName(pattern, type);
- } else if (pattern.startsWith("*")
- && pattern.indexOf('*', 1) == -1
- && pattern.indexOf('?') == -1
- && pattern.indexOf('[') == -1) {
- addExtension(pattern.substring(1), type);
+
+ if (isJavaRegex) {
+ // in this case, we don't need to build a regex pattern
+ // it's already there for us, so just add the pattern as is
+ addGlob(pattern, type);
} else {
- addGlob(compile(pattern), type);
+
+ if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1
+ && pattern.indexOf('[') == -1) {
+ addName(pattern, type);
+ } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1
+ && pattern.indexOf('?') == -1 && pattern.indexOf('[') ==
-1) {
+ addExtension(pattern.substring(1), type);
+ } else {
+ addGlob(compile(pattern), type);
+ }
}
}
-
+
private void addName(String name, MimeType type) throws MimeTypeException {
MimeType previous = names.get(name);
if (previous == null || previous.isDescendantOf(type)) {
@@ -107,7 +118,7 @@
throws MimeTypeException {
MimeType previous = globs.get(glob);
if (previous == null || previous.isDescendantOf(type)) {
- extensions.put(glob, type);
+ globs.put(glob, type);
} else if (previous == type || type.isDescendantOf(previous)) {
// do nothing
} else {
Modified:
lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=751586&r1=751585&r2=751586&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Mon Mar 9 05:32:46 2009
@@ -208,6 +208,28 @@
assertTypeByName("application/postscript", "x.epsi");
}
+ /**
+ * @since TIKA-194
+ */
+ public void testJavaRegex() throws Exception{
+ MimeType testType = new MimeType(this.repo, "foo/bar");
+ this.repo.add(testType);
+ assertNotNull(repo.forName("foo/bar"));
+ String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
+ System.out.println("Pattern: ["+pattern+"]");
+ this.repo.addPattern(testType, pattern, true);
+ String testFileName = "rtg_sst_grb_0.5.12345678";
+ assertNotNull(this.repo.getMimeType(testFileName));
+ assertEquals(this.repo.getMimeType(testFileName).getName(),
"foo/bar");
+
+ MimeType testType2 = new MimeType(this.repo, "foo/bar2");
+ this.repo.add(testType2);
+ assertNotNull(repo.forName("foo/bar2"));
+ this.repo.addPattern(testType2, pattern, false);
+ assertNotNull(this.repo.getMimeType(testFileName));
+ assertNotSame("foo/bar2",
this.repo.getMimeType(testFileName).getName());
+ }
+
public void testRawDetection() throws Exception {
assertTypeByName("image/x-tika-dng", "x.dng");
assertTypeByName("image/x-tika-dng", "x.DNG");