jukka
Tue, 26 Jan 2010 03:28:08 -0800
Author: jukka Date: Tue Jan 26 11:27:33 2010 New Revision: 903176 URL: http://svn.apache.org/viewvc?rev=903176&view=rev Log: TIKA-368: ID3v2 support for mp3 parser Patch by Nick Burch Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1.mp3 - copied unchanged from r903148, lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3.mp3 lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3 (with props) lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3 (with props) Removed: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3.mp3 Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java?rev=903176&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java 
(added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java Tue Jan 26 11:27:33 2010 @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp3; + +/** + * Takes an array of {@link ID3Tags} in preference order, and when asked for + * a given tag, will return it from the first {@link ID3Tags} that has it. 
+ */ +public class CompositeTagHandler implements ID3Tags { + + private ID3Tags[] tags; + + public CompositeTagHandler(ID3Tags[] tags) { + this.tags = tags; + } + + public boolean getTagsPresent() { + for (ID3Tags tag : tags) { + if (tag.getTagsPresent()) { + return true; + } + } + return false; + } + + public String getTitle() { + for (ID3Tags tag : tags) { + if (tag.getTitle() != null) { + return tag.getTitle(); + } + } + return null; + } + + public String getArtist() { + for (ID3Tags tag : tags) { + if (tag.getArtist() != null) { + return tag.getArtist(); + } + } + return null; + } + + public String getAlbum() { + for (ID3Tags tag : tags) { + if (tag.getAlbum() != null) { + return tag.getAlbum(); + } + } + return null; + } + + public String getYear() { + for (ID3Tags tag : tags) { + if (tag.getYear() != null) { + return tag.getYear(); + } + } + return null; + } + + public String getComment() { + for (ID3Tags tag : tags) { + if (tag.getComment() != null) { + return tag.getComment(); + } + } + return null; + } + + public String getGenre() { + for (ID3Tags tag : tags) { + if (tag.getGenre() != null) { + return tag.getGenre(); + } + } + return null; + } + + public String getTrackNumber() { + for (ID3Tags tag : tags) { + if (tag.getTrackNumber() != null) { + return tag.getTrackNumber(); + } + } + return null; + } + +} Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java?rev=903176&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java Tue Jan 26 11:27:33 2010 @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Common view of the tags produced by the ID3 parsers, such as ID3v1 and
 * ID3v2.3.
 * Implementations should return <code>null</code> when the file lacks a
 * given tag, or when the tag isn't defined for that ID3 version.
 *
 * Only the ID3v1 core tags are listed so far. More may be added later to
 * cover the extra tags that the ID3v2 handlers can produce.
 */
public interface ID3Tags {
    /**
     * Predefined genre names, indexed by numeric genre id (0 to 125).
     * The final entry is an empty-string sentinel.
     *
     * @see <a href="http://www.id3.org/id3v2-00">ID3v2 specification</a>
     */
    String[] GENRES = {
        // 0 - 9
        "Blues", "Classic Rock", "Country", "Dance", "Disco",
        "Funk", "Grunge", "Hip-Hop", "Jazz", "Metal",
        // 10 - 19
        "New Age", "Oldies", "Other", "Pop", "R&B",
        "Rap", "Reggae", "Rock", "Techno", "Industrial",
        // 20 - 29
        "Alternative", "Ska", "Death Metal", "Pranks", "Soundtrack",
        "Euro-Techno", "Ambient", "Trip-Hop", "Vocal", "Jazz+Funk",
        // 30 - 39
        "Fusion", "Trance", "Classical", "Instrumental", "Acid",
        "House", "Game", "Sound Clip", "Gospel", "Noise",
        // 40 - 49
        "AlternRock", "Bass", "Soul", "Punk", "Space",
        "Meditative", "Instrumental Pop", "Instrumental Rock", "Ethnic", "Gothic",
        // 50 - 59
        "Darkwave", "Techno-Industrial", "Electronic", "Pop-Folk", "Eurodance",
        "Dream", "Southern Rock", "Comedy", "Cult", "Gangsta",
        // 60 - 69
        "Top 40", "Christian Rap", "Pop/Funk", "Jungle", "Native American",
        "Cabaret", "New Wave", "Psychadelic", "Rave", "Showtunes",
        // 70 - 79
        "Trailer", "Lo-Fi", "Tribal", "Acid Punk", "Acid Jazz",
        "Polka", "Retro", "Musical", "Rock & Roll", "Hard Rock",
        // 80 - 89
        "Folk", "Folk-Rock", "National Folk", "Swing", "Fast Fusion",
        "Bebob", "Latin", "Revival", "Celtic", "Bluegrass",
        // 90 - 99
        "Avantgarde", "Gothic Rock", "Progressive Rock", "Psychedelic Rock", "Symphonic Rock",
        "Slow Rock", "Big Band", "Chorus", "Easy Listening", "Acoustic",
        // 100 - 109
        "Humour", "Speech", "Chanson", "Opera", "Chamber Music",
        "Sonata", "Symphony", "Booty Bass", "Primus", "Porn Groove",
        // 110 - 119
        "Satire", "Slow Jam", "Club", "Tango", "Samba",
        "Folklore", "Ballad", "Power Ballad", "Rhythmic Soul", "Freestyle",
        // 120 - 125
        "Duet", "Punk Rock", "Drum Solo", "A capella", "Euro-House",
        "Dance Hall",
        // sentinel
        ""
    };

    /**
     * Does the file contain this kind of tags?
     */
    boolean getTagsPresent();

    String getTitle();

    String getArtist();

    String getAlbum();

    String getComment();

    String getGenre();

    String getYear();

    String getTrackNumber();

}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp3; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; + +import org.apache.tika.exception.TikaException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * This is used to parse ID3 Version 1 Tag information from an MP3 file, + * if available. 
+ * + * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a> + */ +public class ID3v1Handler implements ID3Tags { + private String title; + private String artist; + private String album; + private String year; + private String comment; + private String genre; + private String trackNumber; + + boolean found = false; + + public ID3v1Handler(InputStream stream, ContentHandler handler) + throws IOException, SAXException, TikaException { + byte[] tag = getSuffix(stream, 128); + if (tag.length == 128 + && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') { + found = true; + + title = getString(tag, 3, 33); + artist = getString(tag, 33, 63); + album = getString(tag, 63, 93); + year = getString(tag, 93, 97); + comment = getString(tag, 97, 127); + + int genreID = (int) tag[127] & 0xff; // unsigned byte + genre = GENRES[Math.min(genreID, GENRES.length - 1)]; + + // ID3v1.1 Track addition + // If the last two bytes of the comment field are zero and + // non-zero, then the last byte is the track number + if (tag[125] == 0 && tag[126] != 0) { + int trackNum = (int) tag[126] & 0xff; + trackNumber = Integer.toString(trackNum); + } + } + } + + + public boolean getTagsPresent() { + return found; + } + + public String getTitle() { + return title; + } + + public String getArtist() { + return artist; + } + + public String getAlbum() { + return album; + } + + public String getYear() { + return year; + } + + public String getComment() { + return comment; + } + + public String getGenre() { + return genre; + } + + public String getTrackNumber() { + return trackNumber; + } + + + /** + * Returns the identified ISO-8859-1 substring from the given byte buffer. + * The return value is the zero-terminated substring retrieved from + * between the given start and end positions in the given byte buffer. + * Extra whitespace (and control characters) from the beginning and the + * end of the substring is removed. 
+ * + * @param buffer byte buffer + * @param start start index of the substring + * @param end end index of the substring + * @return the identified substring + * @throws TikaException if the ISO-8859-1 encoding is not available + */ + private static String getString(byte[] buffer, int start, int end) + throws TikaException { + // Find the zero byte that marks the end of the string + int zero = start; + while (zero < end && buffer[zero] != 0) { + zero++; + } + + // Skip trailing whitespace + end = zero; + while (start < end && buffer[end - 1] <= ' ') { + end--; + } + + // Skip leading whitespace + while (start < end && buffer[start] <= ' ') { + start++; + } + + // Return the remaining substring + try { + return new String(buffer, start, end - start, "ISO-8859-1"); + } catch (UnsupportedEncodingException e) { + throw new TikaException("ISO-8859-1 encoding is not available", e); + } + } + + /** + * Reads and returns the last <code>length</code> bytes from the + * given stream. + * @param stream input stream + * @param length number of bytes from the end to read and return + * @return stream the <code>InputStream</code> to read from. + * @throws IOException if the stream could not be read from. 
+ */ + private static byte[] getSuffix(InputStream stream, int length) + throws IOException { + byte[] buffer = new byte[2 * length]; + int bytesInBuffer = 0; + + int n = stream.read(buffer); + while (n != -1) { + bytesInBuffer += n; + if (bytesInBuffer == buffer.length) { + System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length); + bytesInBuffer = length; + } + n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer); + } + + if (bytesInBuffer < length) { + length = bytesInBuffer; + } + + byte[] result = new byte[length]; + System.arraycopy(buffer, bytesInBuffer - length, result, 0, length); + return result; + } + +} Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java?rev=903176&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java Tue Jan 26 11:27:33 2010 @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp3; + +import java.io.IOException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.mp3.ID3v2Frame.RawTag; +import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator; +import org.xml.sax.SAXException; + +/** + * This is used to parse ID3 Version 2.2 Tag information from an MP3 file, + * if available. + * + * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a> + */ +public class ID3v22Handler implements ID3Tags { + private String title; + private String artist; + private String album; + private String year; + private String comment; + private String genre; + private String trackNumber; + + public ID3v22Handler(ID3v2Frame frame) + throws IOException, SAXException, TikaException { + RawTagIterator tags = new RawV22TagIterator(frame); + while (tags.hasNext()) { + RawTag tag = tags.next(); + if (tag.name.equals("TT2")) { + title = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TP1")) { + artist = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TAL")) { + album = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TYE")) { + year = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("COM")) { + comment = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TRK")) { + trackNumber = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TCO")) { + String rawGenre = getTagString(tag.data, 0, tag.data.length); + int open = rawGenre.indexOf("("); + int close = rawGenre.indexOf(")"); + if (open < close) { + try { + int genreID = Integer.parseInt(rawGenre.substring(open+1, close)); + genre = ID3Tags.GENRES[genreID]; + } catch(NumberFormatException ignore) { + } + } + } + } + } + + private String getTagString(byte[] data, 
int offset, int length) { + return ID3v2Frame.getTagString(data, offset, length); + } + + public boolean getTagsPresent() { + return true; + } + + public String getTitle() { + return title; + } + + public String getArtist() { + return artist; + } + + public String getAlbum() { + return album; + } + + public String getYear() { + return year; + } + + public String getComment() { + return comment; + } + + public String getGenre() { + return genre; + } + + public String getTrackNumber() { + return trackNumber; + } + + private class RawV22TagIterator extends RawTagIterator { + private RawV22TagIterator(ID3v2Frame frame) { + frame.super(3, 3, 1, 0); + } + } + +} Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java?rev=903176&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java Tue Jan 26 11:27:33 2010 @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp3; + +import java.io.IOException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.mp3.ID3v2Frame.RawTag; +import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator; +import org.xml.sax.SAXException; + +/** + * This is used to parse ID3 Version 2.3 Tag information from an MP3 file, + * if available. + * + * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a> + */ +public class ID3v23Handler implements ID3Tags { + private String title; + private String artist; + private String album; + private String year; + private String comment; + private String genre; + private String trackNumber; + + public ID3v23Handler(ID3v2Frame frame) + throws IOException, SAXException, TikaException { + RawTagIterator tags = new RawV23TagIterator(frame); + while (tags.hasNext()) { + RawTag tag = tags.next(); + if (tag.name.equals("TIT2")) { + title = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPE1")) { + artist = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TALB")) { + album = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TYER")) { + year = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("COMM")) { + comment = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TRCK")) { + trackNumber = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TCON")) { + String rawGenre = getTagString(tag.data, 0, tag.data.length); + int open = rawGenre.indexOf("("); + int close = rawGenre.indexOf(")"); + if (open < close) { + try { + int genreID = Integer.parseInt(rawGenre.substring(open+1, close)); + genre = ID3Tags.GENRES[genreID]; + } catch(NumberFormatException ignore) { + } + } + } + } + } + + private String 
getTagString(byte[] data, int offset, int length) { + return ID3v2Frame.getTagString(data, offset, length); + } + + public boolean getTagsPresent() { + return true; + } + + public String getTitle() { + return title; + } + + public String getArtist() { + return artist; + } + + public String getAlbum() { + return album; + } + + public String getYear() { + return year; + } + + public String getComment() { + return comment; + } + + public String getGenre() { + return genre; + } + + public String getTrackNumber() { + return trackNumber; + } + + private class RawV23TagIterator extends RawTagIterator { + private RawV23TagIterator(ID3v2Frame frame) { + frame.super(4, 4, 1, 2); + } + } + +} Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java?rev=903176&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java Tue Jan 26 11:27:33 2010 @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp3; + +import java.io.IOException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.mp3.ID3v2Frame.RawTag; +import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator; +import org.xml.sax.SAXException; + +/** + * This is used to parse ID3 Version 2.4 Tag information from an MP3 file, + * if available. + * + * @see <a href="http://www.id3.org/id3v2.4.0-structures">MP3 ID3 Version 2.4 specification</a> + * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a> + */ +public class ID3v24Handler implements ID3Tags { + private String title; + private String artist; + private String album; + private String year; + private String comment; + private String genre; + private String trackNumber; + + public ID3v24Handler(ID3v2Frame frame) + throws IOException, SAXException, TikaException { + RawTagIterator tags = new RawV24TagIterator(frame); + while (tags.hasNext()) { + RawTag tag = tags.next(); + if (tag.name.equals("TIT2")) { + title = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TPE1")) { + artist = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TALB")) { + album = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TYER")) { + year = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("COMM")) { + comment = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TRCK")) { + trackNumber = getTagString(tag.data, 0, tag.data.length); + } else if (tag.name.equals("TCON")) { + String rawGenre = getTagString(tag.data, 0, tag.data.length); + int open = rawGenre.indexOf("("); + int close = rawGenre.indexOf(")"); + if (open < close) { + try { + int genreID = Integer.parseInt(rawGenre.substring(open+1, close)); + genre = ID3Tags.GENRES[genreID]; + } 
catch(NumberFormatException ignore) { + } + } + } + } + } + + private String getTagString(byte[] data, int offset, int length) { + return ID3v2Frame.getTagString(data, offset, length); + } + + public boolean getTagsPresent() { + return true; + } + + public String getTitle() { + return title; + } + + public String getArtist() { + return artist; + } + + public String getAlbum() { + return album; + } + + public String getYear() { + return year; + } + + public String getComment() { + return comment; + } + + public String getGenre() { + return genre; + } + + public String getTrackNumber() { + return trackNumber; + } + + private class RawV24TagIterator extends RawTagIterator { + private RawV24TagIterator(ID3v2Frame frame) { + frame.super(4, 4, 4, 2); + } + } + +} Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=903176&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Tue Jan 26 11:27:33 2010 @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;

/**
 * A frame of ID3v2 data, which is then passed to a handler to
 * be turned into useful data.
 *
 * The frame holds the version bytes, the flags byte, the optional extended
 * header and the raw tag data that follows the header.
 */
public class ID3v2Frame {
    private int majorVersion;
    private int minorVersion;
    private int flags;
    private int length;
    /** Excludes the header size part */
    private byte[] extendedHeader;
    private byte[] data;

    public int getMajorVersion() {
        return majorVersion;
    }

    public int getMinorVersion() {
        return minorVersion;
    }

    public int getFlags() {
        return flags;
    }

    /** Number of data bytes announced by the header and read into the frame. */
    public int getLength() {
        return length;
    }

    /** The extended header bytes, or null when the flags announce none. */
    public byte[] getExtendedHeader() {
        return extendedHeader;
    }

    public byte[] getData() {
        return data;
    }

    /**
     * Returns a frame of ID3v2 data, or null if the
     * next data to be read from the InputStream
     * doesn't correspond to an ID3v2 Frame.
     * The bytes inspected are consumed from the stream either way.
     *
     * @throws IOException if the stream fails, or ends inside the frame
     */
    public static ID3v2Frame createFrameIfPresent(InputStream inp)
            throws IOException {
        int h1 = inp.read();
        int h2 = inp.read();
        int h3 = inp.read();
        if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
            int majorVersion = inp.read();
            int minorVersion = inp.read();
            if (majorVersion == -1 || minorVersion == -1) {
                return null;
            }
            return new ID3v2Frame(majorVersion, minorVersion, inp);
        }

        // Not a frame header
        return null;
    }

    private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
            throws IOException {
        this.majorVersion = majorVersion;
        this.minorVersion = minorVersion;

        // Get the flags and the length. The ID3v2 header stores the size
        // as four bytes of seven bits each (a "synchsafe" integer).
        flags = inp.read();
        length = getSynchsafeInt(readFully(inp, 4), 0);

        // Do we have an extended header?
        // NOTE(review): the ID3v2.3/2.4 documents appear to use bit 6
        // (0x40) for the extended header flag - confirm whether 0x02
        // is intended here
        if ((flags & 0x02) == 0x02) {
            int size = getInt(readFully(inp, 4));
            extendedHeader = readFully(inp, size);
        }

        // Get the frame's data
        data = readFully(inp, length);
    }

    /** Reads a 4 byte big-endian integer from the start of the array. */
    protected static int getInt(byte[] data) {
        return getInt(data, 0);
    }

    /** Reads a 4 byte big-endian integer starting at the given offset. */
    protected static int getInt(byte[] data, int offset) {
        int b0 = data[offset+0] & 0xFF;
        int b1 = data[offset+1] & 0xFF;
        int b2 = data[offset+2] & 0xFF;
        int b3 = data[offset+3] & 0xFF;
        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
    }

    /** Reads a 3 byte big-endian integer starting at the given offset. */
    protected static int getInt3(byte[] data, int offset) {
        int b0 = data[offset+0] & 0xFF;
        int b1 = data[offset+1] & 0xFF;
        int b2 = data[offset+2] & 0xFF;
        return (b0 << 16) + (b1 << 8) + (b2 << 0);
    }

    /** Reads a 2 byte big-endian integer starting at the given offset. */
    protected static int getInt2(byte[] data, int offset) {
        int b0 = data[offset+0] & 0xFF;
        int b1 = data[offset+1] & 0xFF;
        return (b0 << 8) + (b1 << 0);
    }

    /**
     * Reads a synchsafe integer: four bytes, most significant first, of
     * which only the low seven bits of each byte carry data. The result
     * is therefore never negative.
     */
    protected static int getSynchsafeInt(byte[] data, int offset) {
        int b0 = data[offset+0] & 0x7F;
        int b1 = data[offset+1] & 0x7F;
        int b2 = data[offset+2] & 0x7F;
        int b3 = data[offset+3] & 0x7F;
        return (b0 << 21) + (b1 << 14) + (b2 << 7) + b3;
    }

    /**
     * Reads exactly <code>length</code> bytes from the stream.
     *
     * @throws IOException if the length is negative, or the stream ends
     *         before that many bytes were read
     */
    protected static byte[] readFully(InputStream inp, int length)
            throws IOException {
        if (length < 0) {
            // A size with the top bit set; report it as bad input rather
            // than failing with NegativeArraySizeException
            throw new IOException("Invalid negative length " + length);
        }
        byte[] b = new byte[length];

        int pos = 0;
        int read;
        while (pos < length) {
            read = inp.read(b, pos, length-pos);
            if (read == -1) {
                throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
            }
            pos += read;
        }

        return b;
    }

    /**
     * Returns the (possibly null padded) String at the given offset and
     * length. A first byte of 0 to 3 is taken as the encoding flag
     * (0 = ISO-8859-1, 1 = UTF-16 with BOM, 2 = UTF-16BE, 3 = UTF-8) and
     * is not part of the text; any other first byte is treated as the
     * start of ISO-8859-1 text.
     *
     * @return the decoded text, or the empty string when there is none
     */
    protected static String getTagString(byte[] data, int offset, int length) {
        if (length <= 0) {
            return "";
        }
        int start = offset;
        int limit = offset + length; // exclusive

        // Does it have an encoding flag?
        String encoding = "ISO-8859-1";
        boolean twoByteUnits = false;
        int maybeEncodingFlag = data[offset] & 0xFF;
        if (maybeEncodingFlag <= 3) {
            start++;
            if (maybeEncodingFlag == 1) {
                // With BOM
                encoding = "UTF-16";
                twoByteUnits = true;
            } else if (maybeEncodingFlag == 2) {
                // Without BOM
                encoding = "UTF-16BE";
                twoByteUnits = true;
            } else if (maybeEncodingFlag == 3) {
                encoding = "UTF-8";
            }
        }

        // Drop the null padding, but never walk back over the flag byte
        int end = limit;
        while (end > start && data[end-1] == 0) {
            end--;
        }
        // In UTF-16 a zero can be half of the last character (for example
        // 'a' in little-endian order is 0x61 0x00), so give it back when
        // trimming left an odd number of bytes
        if (twoByteUnits && ((end - start) & 1) == 1 && end < limit) {
            end++;
        }

        try {
            return new String(data, start, end - start, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(
                    "Core encoding " + encoding + " is not available", e);
        }
    }

    /**
     * Returns the String at the given
     * offset and length. Strings are ISO-8859-1
     */
    protected static String getString(byte[] data, int offset, int length) {
        try {
            return new String(data, offset, length, "ISO-8859-1");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(
                    "Core encoding ISO-8859-1 encoding is not available", e);
        }
    }


    /**
     * Iterates over id3v2 raw tags.
     * Create an instance of this that configures the
     * various length and multipliers.
     */
    protected class RawTagIterator implements Iterator<RawTag> {
        private int nameLength;
        private int sizeLength;
        private int sizeMultiplier;
        private int flagLength;

        private int offset = 0;

        protected RawTagIterator(
                int nameLength, int sizeLength, int sizeMultiplier,
                int flagLength) {
            this.nameLength = nameLength;
            this.sizeLength = sizeLength;
            this.sizeMultiplier = sizeMultiplier;
            this.flagLength = flagLength;
        }

        /**
         * True while a complete tag header remains and it does not start
         * with a zero byte, which marks the padding at the end.
         */
        public boolean hasNext() {
            int headerEnd = offset + nameLength + sizeLength + flagLength;
            if (offset < data.length && headerEnd <= data.length) {
                // Check for padding at the end
                if (data[offset] != 0) {
                    return true;
                }
            }
            return false;
        }

        public RawTag next() {
            if (!hasNext()) {
                throw new java.util.NoSuchElementException();
            }
            RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
                    flagLength, data, offset);
            offset += tag.getSize();
            return tag;
        }

        /** Not supported; the frame data is read-only. */
        public void remove() {
            throw new UnsupportedOperationException();
        }

    }

    /** A single raw tag: its name, flag bytes and undecoded data. */
    protected static class RawTag {
        private int headerSize;
        protected String name;
        protected int flag;
        protected byte[] data;

        private RawTag(
                int nameLength, int sizeLength, int sizeMultiplier,
                int flagLength, byte[] frameData, int offset) {
            headerSize = nameLength + sizeLength + flagLength;

            // Name, normally 3 or 4 bytes
            name = getString(frameData, offset, nameLength);

            // Size
            int rawSize;
            if (sizeLength == 3) {
                rawSize = getInt3(frameData, offset+nameLength);
            } else {
                rawSize = getInt(frameData, offset+nameLength);
            }
            int size = rawSize * sizeMultiplier;

            // Flag
            if (flagLength > 0) {
                if (flagLength == 1) {
                    flag = (int)frameData[offset+nameLength+sizeLength];
                } else {
                    flag = getInt2(frameData, offset+nameLength+sizeLength);
                }
            }

            // Now data. A size that is negative or runs past the end of the
            // frame is cut down to the bytes that are really there.
            int dataStart = offset + headerSize;
            int available = Math.max(frameData.length - dataStart, 0);
            if (size < 0 || size > available) {
                size = available;
            }
            data = new byte[size];
            System.arraycopy(frameData, dataStart, data, 0, size);
        }

        /** Header size plus data size, i.e. the distance to the next tag. */
        protected int getSize() {
            return headerSize + data.length;
        }

    }

}
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=903176&r1=903175&r2=903176&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Tue Jan 26 11:27:33 2010 @@ -18,7 +18,8 @@ import java.io.IOException; import java.io.InputStream; -import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -33,144 +34,11 @@ * from an MP3 file, if available. * * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a> + * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a> + * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a> */ public class Mp3Parser implements Parser { - /** - * List of predefined genres. 
- * - * @see http://www.id3.org/id3v2-00 - */ - private static final String[] GENRES = new String[] { - /* 0 */ "Blues", - /* 1 */ "Classic Rock", - /* 2 */ "Country", - /* 3 */ "Dance", - /* 4 */ "Disco", - /* 5 */ "Funk", - /* 6 */ "Grunge", - /* 7 */ "Hip-Hop", - /* 8 */ "Jazz", - /* 9 */ "Metal", - /* 10 */ "New Age", - /* 11 */ "Oldies", - /* 12 */ "Other", - /* 13 */ "Pop", - /* 14 */ "R&B", - /* 15 */ "Rap", - /* 16 */ "Reggae", - /* 17 */ "Rock", - /* 18 */ "Techno", - /* 19 */ "Industrial", - /* 20 */ "Alternative", - /* 21 */ "Ska", - /* 22 */ "Death Metal", - /* 23 */ "Pranks", - /* 24 */ "Soundtrack", - /* 25 */ "Euro-Techno", - /* 26 */ "Ambient", - /* 27 */ "Trip-Hop", - /* 28 */ "Vocal", - /* 29 */ "Jazz+Funk", - /* 30 */ "Fusion", - /* 31 */ "Trance", - /* 32 */ "Classical", - /* 33 */ "Instrumental", - /* 34 */ "Acid", - /* 35 */ "House", - /* 36 */ "Game", - /* 37 */ "Sound Clip", - /* 38 */ "Gospel", - /* 39 */ "Noise", - /* 40 */ "AlternRock", - /* 41 */ "Bass", - /* 42 */ "Soul", - /* 43 */ "Punk", - /* 44 */ "Space", - /* 45 */ "Meditative", - /* 46 */ "Instrumental Pop", - /* 47 */ "Instrumental Rock", - /* 48 */ "Ethnic", - /* 49 */ "Gothic", - /* 50 */ "Darkwave", - /* 51 */ "Techno-Industrial", - /* 52 */ "Electronic", - /* 53 */ "Pop-Folk", - /* 54 */ "Eurodance", - /* 55 */ "Dream", - /* 56 */ "Southern Rock", - /* 57 */ "Comedy", - /* 58 */ "Cult", - /* 59 */ "Gangsta", - /* 60 */ "Top 40", - /* 61 */ "Christian Rap", - /* 62 */ "Pop/Funk", - /* 63 */ "Jungle", - /* 64 */ "Native American", - /* 65 */ "Cabaret", - /* 66 */ "New Wave", - /* 67 */ "Psychadelic", - /* 68 */ "Rave", - /* 69 */ "Showtunes", - /* 70 */ "Trailer", - /* 71 */ "Lo-Fi", - /* 72 */ "Tribal", - /* 73 */ "Acid Punk", - /* 74 */ "Acid Jazz", - /* 75 */ "Polka", - /* 76 */ "Retro", - /* 77 */ "Musical", - /* 78 */ "Rock & Roll", - /* 79 */ "Hard Rock", - /* 80 */ "Folk", - /* 81 */ "Folk-Rock", - /* 82 */ "National Folk", - /* 83 */ "Swing", - /* 84 */ "Fast Fusion", 
- /* 85 */ "Bebob", - /* 86 */ "Latin", - /* 87 */ "Revival", - /* 88 */ "Celtic", - /* 89 */ "Bluegrass", - /* 90 */ "Avantgarde", - /* 91 */ "Gothic Rock", - /* 92 */ "Progressive Rock", - /* 93 */ "Psychedelic Rock", - /* 94 */ "Symphonic Rock", - /* 95 */ "Slow Rock", - /* 96 */ "Big Band", - /* 97 */ "Chorus", - /* 98 */ "Easy Listening", - /* 99 */ "Acoustic", - /* 100 */ "Humour", - /* 101 */ "Speech", - /* 102 */ "Chanson", - /* 103 */ "Opera", - /* 104 */ "Chamber Music", - /* 105 */ "Sonata", - /* 106 */ "Symphony", - /* 107 */ "Booty Bass", - /* 108 */ "Primus", - /* 109 */ "Porn Groove", - /* 110 */ "Satire", - /* 111 */ "Slow Jam", - /* 112 */ "Club", - /* 113 */ "Tango", - /* 114 */ "Samba", - /* 115 */ "Folklore", - /* 116 */ "Ballad", - /* 117 */ "Power Ballad", - /* 118 */ "Rhythmic Soul", - /* 119 */ "Freestyle", - /* 120 */ "Duet", - /* 121 */ "Punk Rock", - /* 122 */ "Drum Solo", - /* 123 */ "A capella", - /* 124 */ "Euro-House", - /* 125 */ "Dance Hall", - /* sentinel */ "" - }; - public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) @@ -179,34 +47,28 @@ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - - byte[] tag = getSuffix(stream, 128); - if (tag.length == 128 - && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') { - String title = getString(tag, 3, 33); - String artist = getString(tag, 33, 63); - String album = getString(tag, 63, 93); - String year = getString(tag, 93, 97); - String comment = getString(tag, 97, 127); - int genre = (int) tag[127] & 0xff; // unsigned byte - - metadata.set(Metadata.TITLE, title); - metadata.set(Metadata.AUTHOR, artist); - - xhtml.element("h1", title); - xhtml.element("p", artist); + + // Create handlers for the various kinds of ID3 tags + ID3Tags[] tags = getAllTagHandlers(stream, handler); + + if (tags.length > 0) { + CompositeTagHandler tag = new CompositeTagHandler(tags); + + metadata.set(Metadata.TITLE, 
tag.getTitle()); + metadata.set(Metadata.AUTHOR, tag.getArtist()); + + xhtml.element("h1", tag.getTitle()); + xhtml.element("p", tag.getArtist()); + // ID3v1.1 Track addition - // If the last two bytes of the comment field are zero and - // non-zero, then the last byte is the track number - if (tag[125] == 0 && tag[126] != 0) { - int track = (int) tag[126] & 0xff; - xhtml.element("p", album + ", track " + track); + if (tag.getTrackNumber() != null) { + xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber()); } else { - xhtml.element("p", album); + xhtml.element("p", tag.getAlbum()); } - xhtml.element("p", year); - xhtml.element("p", comment); - xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]); + xhtml.element("p", tag.getYear()); + xhtml.element("p", tag.getComment()); + xhtml.element("p", tag.getGenre()); } xhtml.endDocument(); @@ -222,75 +84,51 @@ } /** - * Returns the identified ISO-8859-1 substring from the given byte buffer. - * The return value is the zero-terminated substring retrieved from - * between the given start and end positions in the given byte buffer. - * Extra whitespace (and control characters) from the beginning and the - * end of the substring is removed. - * - * @param buffer byte buffer - * @param start start index of the substring - * @param end end index of the substring - * @return the identified substring - * @throws TikaException if the ISO-8859-1 encoding is not available + * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers + * for each supported set of tags. 
*/ - private static String getString(byte[] buffer, int start, int end) - throws TikaException { - // Find the zero byte that marks the end of the string - int zero = start; - while (zero < end && buffer[zero] != 0) { - zero++; - } - - // Skip trailing whitespace - end = zero; - while (start < end && buffer[end - 1] <= ' ') { - end--; - } - - // Skip leading whitespace - while (start < end && buffer[start] <= ' ') { - start++; - } + protected ID3Tags[] getAllTagHandlers(InputStream stream, ContentHandler handler) + throws IOException, SAXException, TikaException { + ID3v24Handler v24 = null; + ID3v23Handler v23 = null; + ID3v22Handler v22 = null; + ID3v1Handler v1 = null; + + // ID3v2 tags live at the start of the file + // You can apparently have several different ID3 tag blocks + // So, keep going until we don't find any more + ID3v2Frame f; + while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null) { + if (f.getMajorVersion() == 4) { + v24 = new ID3v24Handler(f); + } else if(f.getMajorVersion() == 3) { + v23 = new ID3v23Handler(f); + } else if(f.getMajorVersion() == 2) { + v22 = new ID3v22Handler(f); + } + } - // Return the remaining substring - try { - return new String(buffer, start, end - start, "ISO-8859-1"); - } catch (UnsupportedEncodingException e) { - throw new TikaException("ISO-8859-1 encoding is not available", e); - } - } + // ID3v1 tags live at the end of the file + // Just let the handler run until it's finished + v1 = new ID3v1Handler(stream, handler); + + // Go in order of preference + // Currently, that's newest to oldest + List<ID3Tags> tags = new ArrayList<ID3Tags>(); - /** - * Reads and returns the last <code>length</code> bytes from the - * given stream. - * @param stream input stream - * @param length number of bytes from the end to read and return - * @return stream the <code>InputStream</code> to read from. - * @throws IOException if the stream could not be read from. 
- */ - private static byte[] getSuffix(InputStream stream, int length) - throws IOException { - byte[] buffer = new byte[2 * length]; - int bytesInBuffer = 0; - - int n = stream.read(buffer); - while (n != -1) { - bytesInBuffer += n; - if (bytesInBuffer == buffer.length) { - System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length); - bytesInBuffer = length; - } - n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer); + if(v24 != null && v24.getTagsPresent()) { + tags.add(v24); } - - if (bytesInBuffer < length) { - length = bytesInBuffer; + if(v23 != null && v23.getTagsPresent()) { + tags.add(v23); } - - byte[] result = new byte[length]; - System.arraycopy(buffer, bytesInBuffer - length, result, 0, length); - return result; - } + if(v22 != null && v22.getTagsPresent()) { + tags.add(v22); + } + if(v1 != null && v1.getTagsPresent()) { + tags.add(v1); + } + return tags.toArray(new ID3Tags[tags.size()]); + } } Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=903176&r1=903175&r2=903176&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Tue Jan 26 11:27:33 2010 @@ -161,7 +161,7 @@ } public void testMP3Extraction() throws Exception { - File file = getResourceAsFile("/test-documents/testMP3.mp3"); + File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3"); String s1 = ParseUtils.getStringContent(file, tc); String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg"); assertEquals(s1, s2); Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=903176&r1=903175&r2=903176&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Tue Jan 26 11:27:33 2010 @@ -16,6 +16,7 @@ */ package org.apache.tika.parser.mp3; +import java.io.ByteArrayInputStream; import java.io.InputStream; import junit.framework.TestCase; @@ -31,13 +32,16 @@ */ public class Mp3ParserTest extends TestCase { - public void testMp3Parsing() throws Exception { + /** + * Test that with only ID3v1 tags, we get some information out + */ + public void testMp3ParsingID3v1() throws Exception { Parser parser = new AutoDetectParser(); // Should auto-detect! ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); InputStream stream = Mp3ParserTest.class.getResourceAsStream( - "/test-documents/testMP3.mp3"); + "/test-documents/testMP3id3v1.mp3"); try { parser.parse(stream, handler, metadata); } finally { @@ -57,4 +61,80 @@ assertTrue(content.contains("Rock")); } + /** + * Test that with only ID3v2 tags, we get the full + * set of information out. + */ + public void testMp3ParsingID3v2() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + InputStream stream = Mp3ParserTest.class.getResourceAsStream( + "/test-documents/testMP3id3v2.mp3"); + try { + parser.parse(stream, handler, metadata); + } finally { + stream.close(); + } + + assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Test Title", metadata.get(Metadata.TITLE)); + assertEquals("Test Artist", metadata.get(Metadata.AUTHOR)); + + String content = handler.toString(); + assertTrue(content.contains("Test Title")); + assertTrue(content.contains("Test Artist")); + assertTrue(content.contains("Test Album")); + assertTrue(content.contains("2008")); + assertTrue(content.contains("Test Comment")); + assertTrue(content.contains("Rock")); + } + + /** + * Test that with both id3v2 and id3v1, we prefer the + * details from id3v2 + */ + public void testMp3ParsingID3v1v2() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + InputStream stream = Mp3ParserTest.class.getResourceAsStream( + "/test-documents/testMP3id3v1_v2.mp3"); + try { + parser.parse(stream, handler, metadata); + } finally { + stream.close(); + } + + assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Test Title", metadata.get(Metadata.TITLE)); + assertEquals("Test Artist", metadata.get(Metadata.AUTHOR)); + + String content = handler.toString(); + assertTrue(content.contains("Test Title")); + assertTrue(content.contains("Test Artist")); + assertTrue(content.contains("Test Album")); + assertTrue(content.contains("2008")); + assertTrue(content.contains("Test Comment")); + assertTrue(content.contains("Rock")); + } + + public void testID3v2Frame() throws Exception { + byte[] empty = new byte[] { + 0x49, 0x44, 0x33, 3, 1, 0, + 0, 0, 0, 0 + }; + + assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b})); + 
assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1})); + + ID3v2Frame f = ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty)); + assertEquals(3, f.getMajorVersion()); + assertEquals(1, f.getMinorVersion()); + assertEquals(0, f.getFlags()); + assertEquals(0, f.getLength()); + assertEquals(0, f.getData().length); + } } Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3 URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3?rev=903176&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3 ------------------------------------------------------------------------------ svn:executable = * Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3 ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3 URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3?rev=903176&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3 ------------------------------------------------------------------------------ svn:executable = * Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3 ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream