Author: siren
Date: Sat Sep 20 08:08:50 2008
New Revision: 697376

URL: http://svn.apache.org/viewvc?rev=697376&view=rev
Log:
TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
    incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif   (with props)
    incubator/tika/trunk/src/test/resources/test-documents/testAU.au   (with props)
    incubator/tika/trunk/src/test/resources/test-documents/testMID.mid   (with props)
    incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav   (with props)
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sat Sep 20 08:08:50 2008
@@ -84,6 +84,9 @@
 
 35. TIKA-161 - Enable PMD reports (Jukka Zitting)
 
+36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi (Sami Siren)
+
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java?rev=697376&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java Sat Sep 20 08:08:50 2008
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map.Entry;
+
+import javax.sound.sampled.AudioFileFormat;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.UnsupportedAudioFileException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class AudioParser implements Parser {
+
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, metadata);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type != null) {
+            try {
+
+                AudioFileFormat fileFormat = AudioSystem
+                        .getAudioFileFormat(stream);
+
+                AudioFormat format = fileFormat.getFormat();
+
+                metadata.set("samplerate", Integer.toString((int) format
+                        .getSampleRate()));
+                metadata
+                        .set("channels", Integer.toString(format.getChannels()));
+                metadata.set("bits", Integer.toString(format
+                        .getSampleSizeInBits()));
+                metadata.set("encoding", format.getEncoding().toString());
+
+                // Javadoc suggests that some of the following properties might
+                // be available, but I had no success in finding any:
+
+                // "duration" Long playback duration of the file in microseconds
+                // "author" String name of the author of this file
+                // "title" String title of this file
+                // "copyright" String copyright message
+                // "date" Date date of the recording or release
+                // "comment" String an arbitrary text
+
+                for (Entry<String, Object> entry : format.properties()
+                        .entrySet()) {
+                    metadata.set(entry.getKey(), entry.getValue().toString());
+                }
+
+            } catch (UnsupportedAudioFileException e) {
+                // cannot parse, unknown format
+            }
+
+        }
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java?rev=697376&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java Sat Sep 20 08:08:50 2008
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Sequence;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class MidiParser implements Parser {
+
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, metadata);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    private static HashMap<Float, String> divisionTypes = new HashMap<Float, String>();
+
+    static {
+        divisionTypes.put(Sequence.PPQ, "PRQ");
+        divisionTypes.put(Sequence.SMPTE_24, "SMPTE_24");
+        divisionTypes.put(Sequence.SMPTE_25, "SMPTE_25");
+        divisionTypes.put(Sequence.SMPTE_30, "SMPTE_30");
+        divisionTypes.put(Sequence.SMPTE_30DROP, "SMPTE_30DROP");
+    }
+
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type != null) {
+
+            try {
+
+                Sequence sequence = MidiSystem.getSequence(stream);
+
+                metadata.set("tracks", Integer
+                        .toString(sequence.getTracks().length));
+
+                metadata.set("patches", Integer.toString(sequence
+                        .getPatchList().length));
+
+                metadata.set("divisionType", divisionTypes.get(sequence
+                        .getDivisionType()));
+
+            } catch (InvalidMidiDataException e) {
+                // cannot parse format
+            }
+
+        }
+    }
+}

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sat Sep 20 08:08:50 2008
@@ -570,6 +570,10 @@
 
   <mime-type type="audio/midi">
     <glob pattern="*.kar" />
+    <glob pattern="*.mid" />
+    <magic priority ="20">
+      <match type="string" value="MThd" offset="0" />
+    </magic>
   </mime-type>
 
   <mime-type type="audio/x-pn-realaudio">
@@ -693,4 +697,31 @@
     <glob pattern="*.png" />
   </mime-type>
 
+  <mime-type type="audio/basic">
+    <glob pattern="*.au" />
+    <glob pattern="*.snd" />
+    <magic priority="20">
+      <match value=".snd" type="string" offset="0" />
+    </magic>
+  </mime-type>
+
+  <mime-type type="audio/x-aiff">
+    <glob pattern="*.aif" />
+    <glob pattern="*.aiff" />
+    <magic priority="20">
+      <match value="FORM" type="string" offset="0" />
+      <match value="AIFF" type="string" offset="8" />
+      <match value="AIFC" type="string" offset="8" />
+      <match value="8SVX" type="string" offset="8" />
+    </magic>
+  </mime-type>
+
+  <mime-type type="audio/x-wav">
+    <glob pattern="*.wav" />
+    <magic priority="20">
+      <match value="RIFF" type="string" offset="0" />
+      <match value="WAVE" type="string" offset="8" />
+    </magic>
+  </mime-type>
+
 </mime-info>

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sat Sep 20 08:08:50 2008
@@ -125,6 +125,18 @@
         <parser name="parse-mp3" class="org.apache.tika.parser.mp3.Mp3Parser">
                 <mime>audio/mpeg</mime>
         </parser>
+
+        <parser name="parse-midi" class="org.apache.tika.parser.audio.MidiParser">
+                <mime>application/x-midi</mime>
+                <mime>audio/midi</mime>
+        </parser>
+
+        <parser name="parse-audio" class="org.apache.tika.parser.audio.AudioParser">
+                <mime>audio/basic</mime>
+                <mime>audio/x-wav</mime>
+                <mime>audio/x-aiff</mime>
+        </parser>
+
     </parsers>
 
 </properties>
\ No newline at end of file

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Sat Sep 20 08:08:50 2008
@@ -18,14 +18,13 @@
 package org.apache.tika.mime;
 
 // Junit imports
-import java.net.URL;
-import java.net.MalformedURLException;
 import java.io.File;
 import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
 
 import junit.framework.TestCase;
 
-// Tika imports
 import org.apache.tika.config.TikaConfig;
 
 /**
@@ -49,8 +48,6 @@
 
     private static final File f = new File("/a/b/c/x.pdf");
 
-
-
     public TestMimeTypes() {
         try {
             repo = TikaConfig.getDefaultConfig().getMimeRepository();
@@ -84,19 +81,19 @@
         assertEquals("text/plain", repo.getMimeType("x.txt").getName());
         assertEquals("text/html", repo.getMimeType("x.htm").getName());
         assertEquals("text/html", repo.getMimeType("x.html").getName());
-        assertEquals("application/xhtml+xml",
-                repo.getMimeType("x.xhtml").getName());
+        assertEquals("application/xhtml+xml", repo.getMimeType("x.xhtml")
+                .getName());
         assertEquals("application/xml", repo.getMimeType("x.xml").getName());
         assertEquals("application/msword", repo.getMimeType("x.doc").getName());
-        assertEquals("application/vnd.ms-powerpoint",
-                repo.getMimeType("x.ppt").getName());
-        assertEquals("application/vnd.ms-excel",
-                repo.getMimeType("x.xls").getName());
+        assertEquals("application/vnd.ms-powerpoint", repo.getMimeType("x.ppt")
+                .getName());
+        assertEquals("application/vnd.ms-excel", repo.getMimeType("x.xls")
+                .getName());
         assertEquals("application/zip", repo.getMimeType("x.zip").getName());
-        assertEquals("application/vnd.oasis.opendocument.text",
-                repo.getMimeType("x.odt").getName());
-        assertEquals("application/octet-stream",
-                repo.getMimeType("x.xyz").getName());
+        assertEquals("application/vnd.oasis.opendocument.text", repo
+                .getMimeType("x.odt").getName());
+        assertEquals("application/octet-stream", repo.getMimeType("x.xyz")
+                .getName());
     }
 
     /**
@@ -110,9 +107,10 @@
         // TODO: Currently returns generic MS Office type based on
         // the magic header. The getMimeType method should understand
         // MS Office types better.
-        // assertEquals("application/vnd.ms-excel", getMimeType("testEXCEL.xls"));
+        // assertEquals("application/vnd.ms-excel",
+        // getMimeType("testEXCEL.xls"));
         // assertEquals("application/vnd.ms-powerpoint",
-        //         getMimeType("testPPT.ppt"));
+        // getMimeType("testPPT.ppt"));
         // assertEquals("application/msword", getMimeType("testWORD.doc"));
         assertEquals("text/html", getMimeType("testHTML_utf8.html"));
         assertEquals("application/vnd.oasis.opendocument.text",
@@ -121,9 +119,12 @@
         assertEquals("application/rtf", getMimeType("testRTF.rtf"));
         assertEquals("text/plain", getMimeType("testTXT.txt"));
         assertEquals("application/xml", getMimeType("testXML.xml"));
+        assertEquals("audio/basic", getMimeType("testAU.au"));
+        assertEquals("audio/x-aiff", getMimeType("testAIFF.aif"));
+        assertEquals("audio/x-wav", getMimeType("testWAV.wav"));
+        assertEquals("audio/midi", getMimeType("testMID.mid"));
     }
 
-
     private String getMimeType(String filename) {
 
         String type = null;

Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java?rev=697376&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java Sat Sep 20 08:08:50 2008
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class AudioParserTest extends TestCase {
+
+    private final Parser parser = new AudioParser();
+
+    public void testWAV() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "audio/x-wav");
+        InputStream stream = getClass().getResourceAsStream(
+                "/test-documents/testWAV.wav");
+
+        parser.parse(stream, new DefaultHandler(), metadata);
+
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+    }
+
+    public void testAIFF() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "audio/x-aiff");
+        InputStream stream = getClass().getResourceAsStream(
+                "/test-documents/testAIFF.aif");
+
+        parser.parse(stream, new DefaultHandler(), metadata);
+
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+    }
+
+    public void testAU() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "audio/basic");
+        InputStream stream = getClass().getResourceAsStream(
+                "/test-documents/testAU.au");
+
+        parser.parse(stream, new DefaultHandler(), metadata);
+
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+
+    }
+
+}

Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java?rev=697376&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java Sat Sep 20 08:08:50 2008
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class MidiParserTest extends TestCase {
+
+    private final Parser parser = new MidiParser();
+
+    public void testMID() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
+        InputStream stream = getClass().getResourceAsStream(
+                "/test-documents/testMID.mid");
+
+        parser.parse(stream, new DefaultHandler(), metadata);
+
+        assertEquals("2", metadata.get("tracks"));
+        assertEquals("0", metadata.get("patches"));
+        assertEquals("PRQ", metadata.get("divisionType"));
+
+    }
+}

Added: incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif?rev=697376&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/tika/trunk/src/test/resources/test-documents/testAU.au
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testAU.au?rev=697376&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/testAU.au
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/tika/trunk/src/test/resources/test-documents/testMID.mid
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testMID.mid?rev=697376&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/testMID.mid
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav?rev=697376&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to