Author: mattmann
Date: Fri Nov 13 22:35:27 2009
New Revision: 836035
URL: http://svn.apache.org/viewvc?rev=836035&view=rev
Log:
- Fix for TIKA-309: MIME type application/rdf+xml not correctly detected
Modified:
lucene/tika/trunk/CHANGES.txt
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Modified: lucene/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Fri Nov 13 22:35:27 2009
@@ -5,6 +5,8 @@
------------------------
The most notable changes in Tika 0.5 over the previous release are:
+ * Improved RDF/OWL mime detection using both MIME magic as well as
+ pattern matching (TIKA-309)
* An org.apache.tika.Tika facade class has been added to simplify common
text extraction and type detection use cases. (TIKA-269)
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
Fri Nov 13 22:35:27 2009
@@ -255,7 +255,7 @@
RootXML xml = null;
String content = new String(data);
for (int i = 0; i < rootXML.size(); i++) {
- xml = rootXML.get(i);
+ xml = rootXML.get(i);
if (xml.matches(content)) {
return true;
}
@@ -340,7 +340,7 @@
}
String regex = null;
if (isEmpty(namespaceURI)) {
- regex = ".*<" + localName + "[^<>]*>.*";
+ regex = ".*<" + localName + "[^<>]*.*";
} else if (isEmpty(localName)) {
regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI
+ "[\"\']?[^<>]*>.*";
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
(original)
+++
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Fri Nov 13 22:35:27 2009
@@ -366,12 +366,17 @@
<mime-type type="application/qsig"/>
<mime-type type="application/rdf+xml">
+ <root-XML localName="rdf:RDF"/>
+ <root-XML localName="RDF"
+ namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+ <sub-class-of type="application/xml"/>
<acronym>RDF/XML</acronym>
<comment>XML syntax for RDF graphs</comment>
<glob pattern="*.owl"/>
<glob pattern="*.rdf"/>
- <root-XML localName="RDF"
- namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+ <glob pattern="*.owl"/>
+ <glob pattern="^rdf$" isregex="true"/>
+ <glob pattern="^owl$" isregex="true"/>
</mime-type>
<mime-type type="application/reginfo+xml">
@@ -3548,7 +3553,6 @@
<match value="<body" type="string" offset="0"/>
<match value="<TITLE" type="string" offset="0"/>
<match value="<title" type="string" offset="0"/>
- <match value="<!--" type="string" offset="0"/>
<match value="<h1" type="string" offset="0"/>
<match value="<H1" type="string" offset="0"/>
<match value="<!doctype HTML" type="string" offset="0"/>
Modified:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=836035&r1=836034&r2=836035&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
(original)
+++
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
Fri Nov 13 22:35:27 2009
@@ -18,6 +18,7 @@
import java.io.IOException;
import java.io.InputStream;
+import java.net.URL;
import junit.framework.TestCase;
@@ -47,6 +48,8 @@
testFile("application/xml", "test-utf16be.xml");
testFile("application/xml", "test-long-comment.xml");
testFile("application/xslt+xml", "stylesheet.xsl");
+ testUrl("application/rdf+xml", new
URL("http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl"));
+ testUrl("application/rdf+xml", new
URL("http://www.w3.org/2002/07/owl#"));
}
public void testAutosetSupertype() throws MimeTypeException {
@@ -57,25 +60,34 @@
type = types.forName("text/something");
assertEquals("text/plain", type.getSuperType().getName());
}
+
+ private void testUrl(String expected, URL url) throws IOException{
+ InputStream in = url.openStream();
+ testStream(expected, url.toString(), in);
+ }
private void testFile(String expected, String filename) throws IOException
{
InputStream in = getClass().getResourceAsStream(filename);
- assertNotNull("Test file not found: " + filename, in);
+ testStream(expected, filename, in);
+ }
+
+ private void testStream(String expected, String urlOrFileName, InputStream
in) throws IOException{
+ assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
if (!in.markSupported()) {
in = new java.io.BufferedInputStream(in);
}
try {
Metadata metadata = new Metadata();
String mime = this.mimeTypes.detect(in, metadata).toString();
- assertEquals(filename + " is not properly detected.", expected,
mime);
+ assertEquals(urlOrFileName + " is not properly detected.",
expected, mime);
//Add resource name and test again
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
mime = this.mimeTypes.detect(in, metadata).toString();
- assertEquals(filename + " is not properly detected.", expected,
mime);
+ assertEquals(urlOrFileName + " is not properly detected after
adding resource name.", expected, mime);
} finally {
in.close();
- }
+ }
}
}