Author: siren
Date: Sat Sep 20 08:08:50 2008
New Revision: 697376
URL: http://svn.apache.org/viewvc?rev=697376&view=rev
Log:
TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif (with
props)
incubator/tika/trunk/src/test/resources/test-documents/testAU.au (with
props)
incubator/tika/trunk/src/test/resources/test-documents/testMID.mid (with
props)
incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav (with
props)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sat Sep 20 08:08:50 2008
@@ -84,6 +84,9 @@
35. TIKA-161 - Enable PMD reports (Jukka Zitting)
+36. TIKA-159 - Add support for parsing basic audio types: wav, aiff, au, midi
(Sami Siren)
+
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java?rev=697376&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
Sat Sep 20 08:08:50 2008
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map.Entry;
+
+import javax.sound.sampled.AudioFileFormat;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.UnsupportedAudioFileException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for sampled audio documents that records the basic format
+ * properties reported by {@link AudioSystem} as metadata. No text content
+ * is extracted; the XHTML output is an empty document.
+ */
+public class AudioParser implements Parser {
+
+    /**
+     * Extracts the audio metadata and then emits an empty XHTML document.
+     *
+     * @param stream   the audio document to read
+     * @param handler  receives the events of the (empty) XHTML document
+     * @param metadata source of the content type and target of the
+     *                 extracted properties
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects the document events
+     * @throws TikaException declared by the {@link Parser} contract
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, metadata);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * Reads the audio header and stores "samplerate", "channels", "bits"
+     * and "encoding", plus any provider-specific properties, in the given
+     * metadata. Nothing is extracted when the metadata carries no content
+     * type or when the audio format is not recognised.
+     *
+     * @param stream   the audio document to read
+     * @param metadata source of the content type and target of the
+     *                 extracted properties
+     * @throws IOException   if the stream cannot be read
+     * @throws TikaException declared for symmetry with the handler variant
+     */
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+            return;
+        }
+
+        // AudioSystem probes the stream using mark/reset and may fail with
+        // an IOException when the stream does not support it, so such
+        // streams are buffered first.
+        InputStream markable = stream.markSupported()
+                ? stream : new java.io.BufferedInputStream(stream);
+
+        try {
+            AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(markable);
+            AudioFormat format = fileFormat.getFormat();
+
+            metadata.set("samplerate",
+                    Integer.toString((int) format.getSampleRate()));
+            metadata.set("channels", Integer.toString(format.getChannels()));
+            metadata.set("bits",
+                    Integer.toString(format.getSampleSizeInBits()));
+            metadata.set("encoding", format.getEncoding().toString());
+
+            // The Javadoc documents file-level properties such as
+            // "duration", "author", "title", "copyright", "date" and
+            // "comment" on AudioFileFormat, and format-level ones on
+            // AudioFormat. Providers frequently leave both maps empty, so
+            // whatever is present is copied as-is.
+            copyProperties(fileFormat.properties(), metadata);
+            copyProperties(format.properties(), metadata);
+        } catch (UnsupportedAudioFileException e) {
+            // Deliberately best-effort: an unrecognised audio format simply
+            // yields no audio metadata instead of failing the whole parse.
+        }
+    }
+
+    /** Copies every non-null property value into the metadata as a string. */
+    private static void copyProperties(
+            java.util.Map<String, Object> properties, Metadata metadata) {
+        for (Entry<String, Object> entry : properties.entrySet()) {
+            Object value = entry.getValue();
+            if (value != null) {
+                metadata.set(entry.getKey(), value.toString());
+            }
+        }
+    }
+}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java?rev=697376&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
Sat Sep 20 08:08:50 2008
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+
+import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Sequence;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for MIDI documents that records the track count, patch count and
+ * timing division type of the sequence as metadata. No text content is
+ * extracted; the XHTML output is an empty document.
+ */
+public class MidiParser implements Parser {
+
+    /**
+     * Metadata labels keyed by the division type constants of
+     * {@link Sequence}.
+     */
+    private static final HashMap<Float, String> divisionTypes =
+            new HashMap<Float, String>();
+
+    static {
+        // NOTE(review): "PRQ" does not match the constant name PPQ and looks
+        // like a typo, but the value is emitted as metadata and asserted by
+        // MidiParserTest - confirm with consumers before renaming it.
+        divisionTypes.put(Sequence.PPQ, "PRQ");
+        divisionTypes.put(Sequence.SMPTE_24, "SMPTE_24");
+        divisionTypes.put(Sequence.SMPTE_25, "SMPTE_25");
+        divisionTypes.put(Sequence.SMPTE_30, "SMPTE_30");
+        divisionTypes.put(Sequence.SMPTE_30DROP, "SMPTE_30DROP");
+    }
+
+    /**
+     * Extracts the MIDI metadata and then emits an empty XHTML document.
+     *
+     * @param stream   the MIDI document to read
+     * @param handler  receives the events of the (empty) XHTML document
+     * @param metadata source of the content type and target of the
+     *                 extracted properties
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects the document events
+     * @throws TikaException declared by the {@link Parser} contract
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, metadata);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * Reads the MIDI sequence and stores "tracks", "patches" and
+     * "divisionType" in the given metadata. Nothing is extracted when the
+     * metadata carries no content type or when the data is not valid MIDI.
+     *
+     * @param stream   the MIDI document to read
+     * @param metadata source of the content type and target of the
+     *                 extracted properties
+     * @throws IOException   if the stream cannot be read
+     * @throws TikaException declared for symmetry with the handler variant
+     */
+    public void parse(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+            return;
+        }
+
+        // MidiSystem probes the stream using mark/reset and may fail with
+        // an IOException when the stream does not support it, so such
+        // streams are buffered first.
+        InputStream markable = stream.markSupported()
+                ? stream : new java.io.BufferedInputStream(stream);
+
+        try {
+            Sequence sequence = MidiSystem.getSequence(markable);
+
+            metadata.set("tracks",
+                    Integer.toString(sequence.getTracks().length));
+            metadata.set("patches",
+                    Integer.toString(sequence.getPatchList().length));
+            // A division type missing from the table yields a null label,
+            // exactly as before.
+            metadata.set("divisionType",
+                    divisionTypes.get(sequence.getDivisionType()));
+        } catch (InvalidMidiDataException e) {
+            // Deliberately best-effort: data that is not valid MIDI simply
+            // yields no MIDI metadata instead of failing the whole parse.
+        }
+    }
+}
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sat Sep 20
08:08:50 2008
@@ -570,6 +570,10 @@
<mime-type type="audio/midi">
<glob pattern="*.kar" />
+ <glob pattern="*.mid" />
+ <magic priority ="20">
+ <match type="string" value="MThd" offset="0" />
+ </magic>
</mime-type>
<mime-type type="audio/x-pn-realaudio">
@@ -693,4 +697,31 @@
<glob pattern="*.png" />
</mime-type>
+ <!-- AU/SND audio: recognised by file extension or by the ".snd"
+      signature at offset 0. -->
+ <mime-type type="audio/basic">
+ <glob pattern="*.au" />
+ <glob pattern="*.snd" />
+ <magic priority="20">
+ <match value=".snd" type="string" offset="0" />
+ </magic>
+ </mime-type>
+
+ <!-- AIFF audio.
+      NOTE(review): the four match elements below are siblings. If the
+      magic matcher treats siblings as alternatives, "FORM" at offset 0
+      alone is enough to match; nesting the offset-8 matches inside the
+      FORM match would require both. TODO confirm the matcher semantics,
+      and whether the 8SVX form type should really map to audio/x-aiff. -->
+ <mime-type type="audio/x-aiff">
+ <glob pattern="*.aif" />
+ <glob pattern="*.aiff" />
+ <magic priority="20">
+ <match value="FORM" type="string" offset="0" />
+ <match value="AIFF" type="string" offset="8" />
+ <match value="AIFC" type="string" offset="8" />
+ <match value="8SVX" type="string" offset="8" />
+ </magic>
+ </mime-type>
+
+ <!-- WAV audio.
+      NOTE(review): same sibling-match concern as for audio/x-aiff; if
+      siblings are alternatives, "RIFF" at offset 0 alone is enough to
+      match. TODO confirm. -->
+ <mime-type type="audio/x-wav">
+ <glob pattern="*.wav" />
+ <magic priority="20">
+ <match value="RIFF" type="string" offset="0" />
+ <match value="WAVE" type="string" offset="8" />
+ </magic>
+ </mime-type>
+
</mime-info>
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sat Sep 20 08:08:50
2008
@@ -125,6 +125,18 @@
<parser name="parse-mp3" class="org.apache.tika.parser.mp3.Mp3Parser">
<mime>audio/mpeg</mime>
</parser>
+
+ <!-- MIDI documents are handled by MidiParser. -->
+ <parser name="parse-midi"
class="org.apache.tika.parser.audio.MidiParser">
+ <mime>application/x-midi</mime>
+ <mime>audio/midi</mime>
+ </parser>
+
+ <!-- AU, WAV and AIFF documents are handled by AudioParser. -->
+ <parser name="parse-audio"
class="org.apache.tika.parser.audio.AudioParser">
+ <mime>audio/basic</mime>
+ <mime>audio/x-wav</mime>
+ <mime>audio/x-aiff</mime>
+ </parser>
+
</parsers>
</properties>
\ No newline at end of file
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=697376&r1=697375&r2=697376&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Sat Sep 20 08:08:50 2008
@@ -18,14 +18,13 @@
package org.apache.tika.mime;
// Junit imports
-import java.net.URL;
-import java.net.MalformedURLException;
import java.io.File;
import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
import junit.framework.TestCase;
-// Tika imports
import org.apache.tika.config.TikaConfig;
/**
@@ -49,8 +48,6 @@
private static final File f = new File("/a/b/c/x.pdf");
-
-
public TestMimeTypes() {
try {
repo = TikaConfig.getDefaultConfig().getMimeRepository();
@@ -84,19 +81,19 @@
assertEquals("text/plain", repo.getMimeType("x.txt").getName());
assertEquals("text/html", repo.getMimeType("x.htm").getName());
assertEquals("text/html", repo.getMimeType("x.html").getName());
- assertEquals("application/xhtml+xml",
- repo.getMimeType("x.xhtml").getName());
+ assertEquals("application/xhtml+xml", repo.getMimeType("x.xhtml")
+ .getName());
assertEquals("application/xml", repo.getMimeType("x.xml").getName());
assertEquals("application/msword",
repo.getMimeType("x.doc").getName());
- assertEquals("application/vnd.ms-powerpoint",
- repo.getMimeType("x.ppt").getName());
- assertEquals("application/vnd.ms-excel",
- repo.getMimeType("x.xls").getName());
+ assertEquals("application/vnd.ms-powerpoint", repo.getMimeType("x.ppt")
+ .getName());
+ assertEquals("application/vnd.ms-excel", repo.getMimeType("x.xls")
+ .getName());
assertEquals("application/zip", repo.getMimeType("x.zip").getName());
- assertEquals("application/vnd.oasis.opendocument.text",
- repo.getMimeType("x.odt").getName());
- assertEquals("application/octet-stream",
- repo.getMimeType("x.xyz").getName());
+ assertEquals("application/vnd.oasis.opendocument.text", repo
+ .getMimeType("x.odt").getName());
+ assertEquals("application/octet-stream", repo.getMimeType("x.xyz")
+ .getName());
}
/**
@@ -110,9 +107,10 @@
// TODO: Currently returns generic MS Office type based on
// the magic header. The getMimeType method should understand
// MS Office types better.
- // assertEquals("application/vnd.ms-excel",
getMimeType("testEXCEL.xls"));
+ // assertEquals("application/vnd.ms-excel",
+ // getMimeType("testEXCEL.xls"));
// assertEquals("application/vnd.ms-powerpoint",
- // getMimeType("testPPT.ppt"));
+ // getMimeType("testPPT.ppt"));
// assertEquals("application/msword", getMimeType("testWORD.doc"));
assertEquals("text/html", getMimeType("testHTML_utf8.html"));
assertEquals("application/vnd.oasis.opendocument.text",
@@ -121,9 +119,12 @@
assertEquals("application/rtf", getMimeType("testRTF.rtf"));
assertEquals("text/plain", getMimeType("testTXT.txt"));
assertEquals("application/xml", getMimeType("testXML.xml"));
+ assertEquals("audio/basic", getMimeType("testAU.au"));
+ assertEquals("audio/x-aiff", getMimeType("testAIFF.aif"));
+ assertEquals("audio/x-wav", getMimeType("testWAV.wav"));
+ assertEquals("audio/midi", getMimeType("testMID.mid"));
}
-
private String getMimeType(String filename) {
String type = null;
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java?rev=697376&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
Sat Sep 20 08:08:50 2008
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Checks that {@link AudioParser} reports the expected format metadata for
+ * the WAV, AIFF and AU sample documents.
+ */
+public class AudioParserTest extends TestCase {
+
+    private final Parser parser = new AudioParser();
+
+    public void testWAV() throws Exception {
+        assertPcmFormat("audio/x-wav", "/test-documents/testWAV.wav");
+    }
+
+    public void testAIFF() throws Exception {
+        assertPcmFormat("audio/x-aiff", "/test-documents/testAIFF.aif");
+    }
+
+    public void testAU() throws Exception {
+        assertPcmFormat("audio/basic", "/test-documents/testAU.au");
+    }
+
+    /**
+     * Parses the given classpath resource with the given content type and
+     * asserts the 44100 Hz / 2 channel / 16 bit / PCM_SIGNED metadata that
+     * all sample documents share.
+     */
+    private void assertPcmFormat(String contentType, String resource)
+            throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, contentType);
+
+        InputStream stream = getClass().getResourceAsStream(resource);
+        // Fail with a clear message instead of an NPE inside the parser.
+        assertNotNull("Missing test resource " + resource, stream);
+        try {
+            parser.parse(stream, new DefaultHandler(), metadata);
+        } finally {
+            // The original tests never closed their streams.
+            stream.close();
+        }
+
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+        assertEquals("16", metadata.get("bits"));
+        assertEquals("PCM_SIGNED", metadata.get("encoding"));
+    }
+
+}
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java?rev=697376&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
Sat Sep 20 08:08:50 2008
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Checks that {@link MidiParser} reports the expected sequence metadata for
+ * the MIDI sample document.
+ */
+public class MidiParserTest extends TestCase {
+
+    private final Parser parser = new MidiParser();
+
+    public void testMID() throws Exception {
+        String resource = "/test-documents/testMID.mid";
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");
+
+        InputStream stream = getClass().getResourceAsStream(resource);
+        // Fail with a clear message instead of an NPE inside the parser.
+        assertNotNull("Missing test resource " + resource, stream);
+        try {
+            parser.parse(stream, new DefaultHandler(), metadata);
+        } finally {
+            // The original test never closed its stream.
+            stream.close();
+        }
+
+        assertEquals("2", metadata.get("tracks"));
+        assertEquals("0", metadata.get("patches"));
+        assertEquals("PRQ", metadata.get("divisionType"));
+    }
+}
Added: incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif?rev=697376&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testAIFF.aif
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/testAU.au
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testAU.au?rev=697376&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testAU.au
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/testMID.mid
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testMID.mid?rev=697376&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testMID.mid
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav?rev=697376&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/testWAV.wav
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream