Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.mp3; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TailStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPDM; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.mp3.ID3Tags.ID3Comment; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information + * from an MP3 file, if available. + * + * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a> + * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a> + * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a> + */ +public class Mp3Parser extends AbstractParser { + + /** Serial version UID */ + private static final long serialVersionUID = 8537074922934844370L; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.audio("mpeg")); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg"); + metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3"); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + // Create handlers for the various kinds of ID3 tags + ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, 
handler); + + // Process tags metadata if the file has supported tags + if (audioAndTags.tags.length > 0) { + CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags); + + metadata.set(TikaCoreProperties.TITLE, tag.getTitle()); + metadata.set(TikaCoreProperties.CREATOR, tag.getArtist()); + metadata.set(XMPDM.ARTIST, tag.getArtist()); + metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist()); + metadata.set(XMPDM.COMPOSER, tag.getComposer()); + metadata.set(XMPDM.ALBUM, tag.getAlbum()); + metadata.set(XMPDM.COMPILATION, tag.getCompilation()); + metadata.set(XMPDM.RELEASE_DATE, tag.getYear()); + metadata.set(XMPDM.GENRE, tag.getGenre()); + + List<String> comments = new ArrayList<String>(); + for (ID3Comment comment : tag.getComments()) { + StringBuffer cmt = new StringBuffer(); + if (comment.getLanguage() != null) { + cmt.append(comment.getLanguage()); + cmt.append(" - "); + } + if (comment.getDescription() != null) { + cmt.append(comment.getDescription()); + if (comment.getText() != null) { + cmt.append("\n"); + } + } + if (comment.getText() != null) { + cmt.append(comment.getText()); + } + + comments.add(cmt.toString()); + metadata.add(XMPDM.LOG_COMMENT.getName(), cmt.toString()); + } + + xhtml.element("h1", tag.getTitle()); + xhtml.element("p", tag.getArtist()); + + // ID3v1.1 Track addition + StringBuilder sb = new StringBuilder(); + sb.append(tag.getAlbum()); + if (tag.getTrackNumber() != null) { + sb.append(", track ").append(tag.getTrackNumber()); + metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber()); + } + if (tag.getDisc() != null) { + sb.append(", disc ").append(tag.getDisc()); + metadata.set(XMPDM.DISC_NUMBER, tag.getDisc()); + } + xhtml.element("p", sb.toString()); + + xhtml.element("p", tag.getYear()); + xhtml.element("p", tag.getGenre()); + xhtml.element("p", String.valueOf(audioAndTags.duration)); + for (String comment : comments) { + xhtml.element("p", comment); + } + } + if (audioAndTags.duration > 0) { + metadata.set(XMPDM.DURATION, 
audioAndTags.duration); + } + if (audioAndTags.audio != null) { + metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate())); + metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels())); + metadata.set("version", audioAndTags.audio.getVersion()); + + metadata.set( + XMPDM.AUDIO_SAMPLE_RATE, + Integer.toString(audioAndTags.audio.getSampleRate())); + if(audioAndTags.audio.getChannels() == 1) { + metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono"); + } else if(audioAndTags.audio.getChannels() == 2) { + metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo"); + } else if(audioAndTags.audio.getChannels() == 5) { + metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1"); + } else if(audioAndTags.audio.getChannels() == 7) { + metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1"); + } + } + if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) { + xhtml.startElement("p", "class", "lyrics"); + xhtml.characters(audioAndTags.lyrics.lyricsText); + xhtml.endElement("p"); + } + + xhtml.endDocument(); + } + + /** + * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers + * for each supported set of tags. 
+ */ + protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler) + throws IOException, SAXException, TikaException { + ID3v24Handler v24 = null; + ID3v23Handler v23 = null; + ID3v22Handler v22 = null; + ID3v1Handler v1 = null; + LyricsHandler lyrics = null; + AudioFrame firstAudio = null; + + TailStream tailStream = new TailStream(stream, 10240+128); + MpegStream mpegStream = new MpegStream(tailStream); + + // ID3v2 tags live at the start of the file + // You can apparently have several different ID3 tag blocks + // So, keep going until we don't find any more + MP3Frame f; + while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) { + if(f instanceof ID3v2Frame) { + ID3v2Frame id3F = (ID3v2Frame)f; + if (id3F.getMajorVersion() == 4) { + v24 = new ID3v24Handler(id3F); + } else if(id3F.getMajorVersion() == 3) { + v23 = new ID3v23Handler(id3F); + } else if(id3F.getMajorVersion() == 2) { + v22 = new ID3v22Handler(id3F); + } + } + } + + // Now iterate over all audio frames in the file + AudioFrame frame = mpegStream.nextFrame(); + float duration = 0; + while (frame != null) + { + duration += frame.getDuration(); + if (firstAudio == null) + { + firstAudio = frame; + } + mpegStream.skipFrame(); + frame = mpegStream.nextFrame(); + } + + // ID3v1 tags live at the end of the file + // Lyrics live just before ID3v1, at the end of the file + // Search for both (handlers seek to the end for us) + lyrics = new LyricsHandler(tailStream.getTail()); + v1 = lyrics.id3v1; + + // Go in order of preference + // Currently, that's newest to oldest + List<ID3Tags> tags = new ArrayList<ID3Tags>(); + + if(v24 != null && v24.getTagsPresent()) { + tags.add(v24); + } + if(v23 != null && v23.getTagsPresent()) { + tags.add(v23); + } + if(v22 != null && v22.getTagsPresent()) { + tags.add(v22); + } + if(v1 != null && v1.getTagsPresent()) { + tags.add(v1); + } + + ID3TagsAndAudio ret = new ID3TagsAndAudio(); + ret.audio = firstAudio; + ret.lyrics = 
lyrics; + ret.tags = tags.toArray(new ID3Tags[tags.size()]); + ret.duration = duration; + return ret; + } + + protected static class ID3TagsAndAudio { + private ID3Tags[] tags; + private AudioFrame audio; + private LyricsHandler lyrics; + private float duration; + } + +}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,469 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mp3;

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * <p>
 * A specialized stream class which can be used to extract single frames of MPEG
 * audio files.
 * </p>
 * <p>
 * Instances of this class are constructed with an underlying stream which
 * should point to an audio file. Read operations are possible in the usual way.
 * However, there are special methods for searching and extracting headers of
 * MPEG frames. Some meta information of frames can be queried.
 * </p>
 */
class MpegStream extends PushbackInputStream
{
    /** Bit rate table for MPEG V1, layer 1. */
    private static final int[] BIT_RATE_MPEG1_L1 = {
        0, 32000, 64000, 96000, 128000, 160000, 192000, 224000, 256000,
        288000, 320000, 352000, 384000, 416000, 448000
    };

    /** Bit rate table for MPEG V1, layer 2. */
    private static final int[] BIT_RATE_MPEG1_L2 = {
        0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
        160000, 192000, 224000, 256000, 320000, 384000
    };

    /** Bit rate table for MPEG V1, layer 3. */
    private static final int[] BIT_RATE_MPEG1_L3 = {
        0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
        160000, 192000, 224000, 256000, 320000
    };

    /** Bit rate table for MPEG V2/V2.5, layer 1. */
    private static final int[] BIT_RATE_MPEG2_L1 = {
        0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
        144000, 160000, 176000, 192000, 224000, 256000
    };

    /** Bit rate table for MPEG V2/V2.5, layer 2 and 3. */
    private static final int[] BIT_RATE_MPEG2_L2 = {
        0, 8000, 16000, 24000, 32000, 40000, 48000, 56000, 64000, 80000,
        96000, 112000, 128000, 144000, 160000
    };

    /** Sample rate table for MPEG V1. */
    private static final int[] SAMPLE_RATE_MPEG1 = {
        44100, 48000, 32000
    };

    /** Sample rate table for MPEG V2. */
    private static final int[] SAMPLE_RATE_MPEG2 = {
        22050, 24000, 16000
    };

    /** Sample rate table for MPEG V2.5. */
    private static final int[] SAMPLE_RATE_MPEG2_5 = {
        11025, 12000, 8000
    };

    /** Sample rate table for all MPEG versions. */
    private static final int[][] SAMPLE_RATE = createSampleRateTable();

    /** Constant for the number of samples for a layer 1 frame. */
    private static final int SAMPLE_COUNT_L1 = 384;

    /** Constant for the number of samples for a layer 2 or 3 frame. */
    private static final int SAMPLE_COUNT_L2 = 1152;

    /** Constant for the size of an MPEG frame header in bytes. */
    private static final int HEADER_SIZE = 4;

    /** The current MPEG header. */
    private AudioFrame currentHeader;

    /** A flag whether the end of the stream is reached. */
    private boolean endOfStream;

    /**
     * Creates a new instance of {@code MpegStream} and initializes it with the
     * underlying stream.
     *
     * @param in the underlying audio stream
     */
    public MpegStream(InputStream in)
    {
        super(in, 2 * HEADER_SIZE);
    }

    /**
     * Searches for the next MPEG frame header from the current stream position
     * on. This method advances the underlying input stream until it finds a
     * valid frame header or the end of the stream is reached. In the former
     * case a corresponding {@code AudioFrame} object is created. In the latter
     * case there are no more headers, so the end of the stream is probably
     * reached.
     *
     * @return the next {@code AudioFrame} or <b>null</b>
     * @throws IOException if an IO error occurs
     */
    public AudioFrame nextFrame() throws IOException
    {
        AudioFrame frame = null;
        while (!endOfStream && frame == null)
        {
            findFrameSyncByte();
            if (!endOfStream)
            {
                HeaderBitField headerField = createHeaderField();
                if (!endOfStream)
                {
                    frame = createHeader(headerField);
                    if (frame == null)
                    {
                        // not a real header: re-scan the three bytes that
                        // followed the sync byte
                        pushBack(headerField);
                    }
                }
            }
        }

        currentHeader = frame;
        return frame;
    }

    /**
     * Skips the current MPEG frame. This method can be called after a valid
     * MPEG header has been retrieved using {@code nextFrame()}. In this case
     * the underlying stream is advanced to the end of the associated MPEG
     * frame. Otherwise, this method has no effect. The return value indicates
     * whether a frame could be skipped.
     *
     * @return <b>true</b> if a frame could be skipped, <b>false</b> otherwise
     * @throws IOException if an IO error occurs
     */
    public boolean skipFrame() throws IOException
    {
        if (currentHeader != null)
        {
            // the header bytes have already been consumed by nextFrame()
            skipStream(in, currentHeader.getLength() - HEADER_SIZE);
            currentHeader = null;
            return true;
        }
        return false;
    }

    /**
     * Advances the underlying stream until the first byte of frame sync is
     * found.
     *
     * @throws IOException if an error occurs
     */
    private void findFrameSyncByte() throws IOException
    {
        boolean found = false;
        while (!found && !endOfStream)
        {
            if (nextByte() == 0xFF)
            {
                found = true;
            }
        }
    }

    /**
     * Creates a bit field for the MPEG frame header. The field is filled with
     * the three bytes following the frame sync byte.
     *
     * @return the bit field
     * @throws IOException if an error occurs
     */
    private HeaderBitField createHeaderField() throws IOException
    {
        HeaderBitField field = new HeaderBitField();
        field.add(nextByte());
        field.add(nextByte());
        field.add(nextByte());
        return field;
    }

    /**
     * Creates an {@code AudioFrame} object based on the given header field. If
     * the header field contains invalid values, result is <b>null</b>.
     *
     * @param bits the header bit field
     * @return the {@code AudioFrame}
     */
    private AudioFrame createHeader(HeaderBitField bits)
    {
        // the three bits after the sync byte must all be set
        if (bits.get(21, 23) != 7)
        {
            return null;
        }

        int mpegVer = bits.get(19, 20);
        int layer = bits.get(17, 18);
        int bitRateCode = bits.get(12, 15);
        int sampleRateCode = bits.get(10, 11);
        int padding = bits.get(9);

        if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15
                || sampleRateCode == 3)
        {
            // invalid header values
            return null;
        }

        int bitRate = calculateBitRate(mpegVer, layer, bitRateCode);
        int sampleRate = calculateSampleRate(mpegVer, sampleRateCode);
        int length = calculateFrameLength(layer, bitRate, sampleRate, padding);
        float duration = calculateDuration(layer, sampleRate);
        int channels = calculateChannels(bits.get(6, 7));
        return new AudioFrame(mpegVer, layer, bitRate, sampleRate, channels,
                length, duration);
    }

    /**
     * Reads the next byte. Once the end of the stream has been reached the
     * {@code endOfStream} flag is set and 0 is returned.
     *
     * @return the next byte
     * @throws IOException if an error occurs
     */
    private int nextByte() throws IOException
    {
        int result = 0;
        if (!endOfStream)
        {
            result = read();
            if (result == -1)
            {
                endOfStream = true;
            }
        }
        return endOfStream ? 0 : result;
    }

    /**
     * Pushes the given header field back in the stream so that the bytes are
     * read again. This method is called if an invalid header was detected. Then
     * search has to continue at the next byte after the frame sync byte.
     *
     * @param field the header bit field with the invalid frame header
     * @throws IOException if an error occurs
     */
    private void pushBack(HeaderBitField field) throws IOException
    {
        unread(field.toArray());
    }

    /**
     * Skips the given number of bytes from the specified input stream.
     * Skipping stops early when the end of the stream is reached.
     *
     * @param in the input stream
     * @param count the number of bytes to skip
     * @throws IOException if an IO error occurs
     */
    private static void skipStream(InputStream in, long count)
            throws IOException
    {
        long remaining = count;
        while (remaining > 0)
        {
            long skipped = in.skip(remaining);
            if (skipped > 0)
            {
                remaining -= skipped;
            }
            else if (skipped < 0 || in.read() == -1)
            {
                // Either the stream signalled its end with a negative skip
                // count, or skip() made no progress and a probing read hit
                // the end. Without the probe a stream whose skip() keeps
                // returning 0 at its end would make this loop spin forever.
                break;
            }
            else
            {
                // skip() made no progress, but the probe consumed one byte
                remaining--;
            }
        }
    }

    /**
     * Calculates the bit rate based on the given parameters.
     *
     * @param mpegVer the MPEG version
     * @param layer the layer
     * @param code the code for the bit rate
     * @return the bit rate in bits per second
     */
    private static int calculateBitRate(int mpegVer, int layer, int code)
    {
        int[] arr = null;

        if (mpegVer == AudioFrame.MPEG_V1)
        {
            switch (layer)
            {
            case AudioFrame.LAYER_1:
                arr = BIT_RATE_MPEG1_L1;
                break;
            case AudioFrame.LAYER_2:
                arr = BIT_RATE_MPEG1_L2;
                break;
            case AudioFrame.LAYER_3:
                arr = BIT_RATE_MPEG1_L3;
                break;
            }
        }
        else
        {
            if (layer == AudioFrame.LAYER_1)
            {
                arr = BIT_RATE_MPEG2_L1;
            }
            else
            {
                arr = BIT_RATE_MPEG2_L2;
            }
        }
        return arr[code];
    }

    /**
     * Calculates the sample rate based on the given parameters.
     *
     * @param mpegVer the MPEG version
     * @param code the code for the sample rate
     * @return the sample rate in samples per second
     */
    private static int calculateSampleRate(int mpegVer, int code)
    {
        return SAMPLE_RATE[mpegVer][code];
    }

    /**
     * Calculates the length of an MPEG frame based on the given parameters.
     *
     * @param layer the layer
     * @param bitRate the bit rate
     * @param sampleRate the sample rate
     * @param padding the padding flag
     * @return the length of the frame in bytes
     */
    private static int calculateFrameLength(int layer, int bitRate,
            int sampleRate, int padding)
    {
        // NOTE(review): MPEG V2/V2.5 layer 3 frames are usually described as
        // carrying 576 samples (factor 72 instead of 144); this method and
        // calculateDuration() treat every layer 2/3 frame as 1152 samples.
        // Confirm whether that is intended.
        if (layer == AudioFrame.LAYER_1)
        {
            return (12 * bitRate / sampleRate + padding) * 4;
        }
        else
        {
            return 144 * bitRate / sampleRate + padding;
        }
    }

    /**
     * Calculates the duration of a MPEG frame based on the given parameters.
     *
     * @param layer the layer
     * @param sampleRate the sample rate
     * @return the duration of this frame in milliseconds
     */
    private static float calculateDuration(int layer, int sampleRate)
    {
        int sampleCount =
                (layer == AudioFrame.LAYER_1) ? SAMPLE_COUNT_L1
                        : SAMPLE_COUNT_L2;
        return (1000.0f / sampleRate) * sampleCount;
    }

    /**
     * Calculates the number of channels based on the given parameters.
     *
     * @param chan the code for the channels
     * @return the number of channels
     */
    private static int calculateChannels(int chan)
    {
        return chan < 3 ? 2 : 1;
    }

    /**
     * Creates the complete array for the sample rate mapping. The table is
     * indexed by MPEG version code; the slot of the version code that is
     * rejected as invalid in {@code createHeader()} stays empty.
     *
     * @return the table for the sample rates
     */
    private static int[][] createSampleRateTable()
    {
        int[][] arr = new int[4][];
        arr[AudioFrame.MPEG_V1] = SAMPLE_RATE_MPEG1;
        arr[AudioFrame.MPEG_V2] = SAMPLE_RATE_MPEG2;
        arr[AudioFrame.MPEG_V2_5] = SAMPLE_RATE_MPEG2_5;
        return arr;
    }

    /**
     * A class representing the bit field of an MPEG header. It allows
     * convenient access to specific bit groups.
     */
    private static class HeaderBitField
    {
        /** The internal value. */
        private int value;

        /**
         * Adds a byte to this field. The bytes added so far are shifted up
         * by eight bits.
         *
         * @param b the byte to be added
         */
        public void add(int b)
        {
            value <<= 8;
            value |= b;
        }

        /**
         * Returns the value of the bit group from the given start and end
         * index. E.g. ''from'' = 0, ''to'' = 3 will return the value of the
         * first 4 bits.
         *
         * @param from the from index
         * @param to the to index
         * @return the value of this group of bits
         */
        public int get(int from, int to)
        {
            int shiftVal = value >> from;
            int mask = (1 << (to - from + 1)) - 1;
            return shiftVal & mask;
        }

        /**
         * Returns the value of the bit with the given index. The bit index is
         * 0-based. Result is either 0 or 1, depending on the value of this bit.
         *
         * @param bit the bit index
         * @return the value of this bit
         */
        public int get(int bit)
        {
            return get(bit, bit);
        }

        /**
         * Returns the internal value of this field as an array. The array
         * contains 3 bytes.
         *
         * @return the internal value of this field as byte array
         */
        public byte[] toArray()
        {
            byte[] result = new byte[3];
            result[0] = (byte) get(16, 23);
            result[1] = (byte) get(8, 15);
            result[2] = (byte) get(0, 7);
            return result;
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp4; + +import com.googlecode.mp4parser.DataSource; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; + +import static com.googlecode.mp4parser.util.CastUtils.l2i; + +/** + * A {@link DataSource} implementation that relies on direct reads from a {@link RandomAccessFile}. + * It should be slower than {@link com.googlecode.mp4parser.FileDataSourceImpl} but does not incur the implicit file locks of + * memory mapped I/O on some JVMs. This implementation allows for a more controlled deletion of files + * and might be preferred when working with temporary files. 
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=4724038">JDK-4724038 : (fs) Add unmap method to MappedByteBuffer</a> + * @see <a href="http://bugs.java.com/view_bug.do?bug_id=6359560">JDK-6359560 : (fs) File.deleteOnExit() doesn't work when MappedByteBuffer exists (win)</a> + */ +public class DirectFileReadDataSource implements DataSource { + + private static final int TRANSFER_SIZE = 8192; + + private RandomAccessFile raf; + + public DirectFileReadDataSource(File f) throws IOException { + this.raf = new RandomAccessFile(f, "r"); + } + + public int read(ByteBuffer byteBuffer) throws IOException { + int len = byteBuffer.remaining(); + int totalRead = 0; + int bytesRead = 0; + byte[] buf = new byte[TRANSFER_SIZE]; + while (totalRead < len) { + int bytesToRead = Math.min((len - totalRead), TRANSFER_SIZE); + bytesRead = raf.read(buf, 0, bytesToRead); + if (bytesRead < 0) { + break; + } else { + totalRead += bytesRead; + } + byteBuffer.put(buf, 0, bytesRead); + } + return ((bytesRead < 0) && (totalRead == 0)) ? 
-1 : totalRead; + } + + public int readAllInOnce(ByteBuffer byteBuffer) throws IOException { + byte[] buf = new byte[byteBuffer.remaining()]; + int read = raf.read(buf); + byteBuffer.put(buf, 0, read); + return read; + } + + public long size() throws IOException { + return raf.length(); + } + + public long position() throws IOException { + return raf.getFilePointer(); + } + + public void position(long nuPos) throws IOException { + raf.seek(nuPos); + } + + public long transferTo(long position, long count, WritableByteChannel target) throws IOException { + return target.write(map(position, count)); + } + + public ByteBuffer map(long startPosition, long size) throws IOException { + raf.seek(startPosition); + byte[] payload = new byte[l2i(size)]; + raf.readFully(payload); + return ByteBuffer.wrap(payload); + } + + public void close() throws IOException { + raf.close(); + } + + +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.mp4; + +import java.io.IOException; +import java.io.InputStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMP; +import org.apache.tika.metadata.XMPDM; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import com.coremedia.iso.IsoFile; +import com.coremedia.iso.boxes.Box; +import com.coremedia.iso.boxes.Container; +import com.coremedia.iso.boxes.FileTypeBox; +import com.coremedia.iso.boxes.MetaBox; +import com.coremedia.iso.boxes.MovieBox; +import com.coremedia.iso.boxes.MovieHeaderBox; +import com.coremedia.iso.boxes.SampleDescriptionBox; +import com.coremedia.iso.boxes.SampleTableBox; +import com.coremedia.iso.boxes.TrackBox; +import com.coremedia.iso.boxes.TrackHeaderBox; +import com.coremedia.iso.boxes.UserDataBox; +import com.coremedia.iso.boxes.apple.AppleItemListBox; +import 
com.coremedia.iso.boxes.sampleentry.AudioSampleEntry; +import com.googlecode.mp4parser.boxes.apple.AppleAlbumBox; +import com.googlecode.mp4parser.boxes.apple.AppleArtistBox; +import com.googlecode.mp4parser.boxes.apple.AppleArtist2Box; +import com.googlecode.mp4parser.boxes.apple.AppleCommentBox; +import com.googlecode.mp4parser.boxes.apple.AppleCompilationBox; +import com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox; +import com.googlecode.mp4parser.boxes.apple.AppleEncoderBox; +import com.googlecode.mp4parser.boxes.apple.AppleGenreBox; +import com.googlecode.mp4parser.boxes.apple.AppleNameBox; +import com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box; +import com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox; +import com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox; +import com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox; + +/** + * Parser for the MP4 media container format, as well as the older + * QuickTime format that MP4 is based on. + * + * This uses the MP4Parser project from http://code.google.com/p/mp4parser/ + * to do the underlying parsing + */ +public class MP4Parser extends AbstractParser { + /** Serial version UID */ + private static final long serialVersionUID = 84011216792285L; + /** TODO Replace this with a 2dp Duration Property Converter */ + private static final DecimalFormat DURATION_FORMAT = + (DecimalFormat)NumberFormat.getNumberInstance(Locale.ROOT); + static { + DURATION_FORMAT.applyPattern("0.0#"); + } + + // Ensure this stays in Sync with the entries in tika-mimetypes.xml + private static final Map<MediaType,List<String>> typesMap = new HashMap<MediaType, List<String>>(); + static { + // All types should be 4 bytes long, space padded as needed + typesMap.put(MediaType.audio("mp4"), Arrays.asList( + "M4A ", "M4B ", "F4A ", "F4B ")); + typesMap.put(MediaType.video("3gpp"), Arrays.asList( + "3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5", "3gp6", "3gs7")); + 
typesMap.put(MediaType.video("3gpp2"), Arrays.asList( + "3g2a", "3g2b", "3g2c")); + typesMap.put(MediaType.video("mp4"), Arrays.asList( + "mp41", "mp42")); + typesMap.put(MediaType.video("x-m4v"), Arrays.asList( + "M4V ", "M4VH", "M4VP")); + + typesMap.put(MediaType.video("quicktime"), Collections.<String>emptyList()); + typesMap.put(MediaType.application("mp4"), Collections.<String>emptyList()); + } + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(typesMap.keySet()); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + IsoFile isoFile; + + // The MP4Parser library accepts either a File, or a byte array + // As MP4 video files are typically large, always use a file to + // avoid OOMs that may occur with in-memory buffering + TemporaryResources tmp = new TemporaryResources(); + TikaInputStream tstream = TikaInputStream.get(stream, tmp); + try { + isoFile = new IsoFile(new DirectFileReadDataSource(tstream.getFile())); + tmp.addResource(isoFile); + + // Grab the file type box + FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class); + if (fileType != null) { + // Identify the type + MediaType type = MediaType.application("mp4"); + for (MediaType t : typesMap.keySet()) { + if (typesMap.get(t).contains(fileType.getMajorBrand())) { + type = t; + break; + } + } + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + + if (type.getType().equals("audio")) { + metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim()); + } + } else { + // Some older QuickTime files lack the FileType + metadata.set(Metadata.CONTENT_TYPE, "video/quicktime"); + } + + + // Get the main MOOV box + MovieBox moov = getOrNull(isoFile, MovieBox.class); + if (moov == null) { + // Bail out + return; + } + + + XHTMLContentHandler xhtml = 
new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + + // Pull out some information from the header box + MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class); + if (mHeader != null) { + // Get the creation and modification dates + metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime()); + metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime()); + + // Get the duration + double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale(); + metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds)); + + // The timescale is normally the sampling rate + metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale()); + } + + + // Get some more information from the track header + // TODO Decide how to handle multiple tracks + List<TrackBox> tb = moov.getBoxes(TrackBox.class); + if (tb.size() > 0) { + TrackBox track = tb.get(0); + + TrackHeaderBox header = track.getTrackHeaderBox(); + // Get the creation and modification dates + metadata.set(TikaCoreProperties.CREATED, header.getCreationTime()); + metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime()); + + // Get the video with and height + metadata.set(Metadata.IMAGE_WIDTH, (int)header.getWidth()); + metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight()); + + // Get the sample information + SampleTableBox samples = track.getSampleTableBox(); + SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox(); + if (sampleDesc != null) { + // Look for the first Audio Sample, if present + AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class); + if (sample != null) { + XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount()); + //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping + metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate()); + //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket()); + 
//metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample()); + } + } + } + + // Get metadata from the User Data Box + UserDataBox userData = getOrNull(moov, UserDataBox.class); + if (userData != null) { + MetaBox meta = getOrNull(userData, MetaBox.class); + + // Check for iTunes Metadata + // See http://atomicparsley.sourceforge.net/mpeg-4files.html and + // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these + AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class); + if (apple != null) { + // Title + AppleNameBox title = getOrNull(apple, AppleNameBox.class); + addMetadata(TikaCoreProperties.TITLE, metadata, title); + + // Artist + AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class); + addMetadata(TikaCoreProperties.CREATOR, metadata, artist); + addMetadata(XMPDM.ARTIST, metadata, artist); + + // Album Artist + AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class); + addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2); + + // Album + AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class); + addMetadata(XMPDM.ALBUM, metadata, album); + + // Composer + AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class); + addMetadata(XMPDM.COMPOSER, metadata, composer); + + // Genre + AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class); + addMetadata(XMPDM.GENRE, metadata, genre); + + // Year + AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class); + if (year != null) { + metadata.set(XMPDM.RELEASE_DATE, year.getValue()); + } + + // Track number + AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class); + if (trackNum != null) { + metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA()); + //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO + } + + // Disc number + AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class); + if (discNum != null) { + metadata.set(XMPDM.DISC_NUMBER, discNum.getA()); + } + + // Compilation + AppleCompilationBox 
compilation = getOrNull(apple, AppleCompilationBox.class); + if (compilation != null) { + metadata.set(XMPDM.COMPILATION, (int)compilation.getValue()); + } + + // Comment + AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class); + addMetadata(XMPDM.LOG_COMMENT, metadata, comment); + + // Encoder + AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class); + if (encoder != null) { + metadata.set(XMP.CREATOR_TOOL, encoder.getValue()); + } + + + // As text + for (Box box : apple.getBoxes()) { + if (box instanceof Utf8AppleDataBox) { + xhtml.element("p", ((Utf8AppleDataBox)box).getValue()); + } + } + } + + // TODO Check for other kinds too + } + + // All done + xhtml.endDocument(); + + } finally { + tmp.dispose(); + } + + } + + private static void addMetadata(String key, Metadata m, Utf8AppleDataBox metadata) { + if (metadata != null) { + m.add(key, metadata.getValue()); + } + } + private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) { + if (metadata != null) { + m.set(prop, metadata.getValue()); + } + } + + private static <T extends Box> T getOrNull(Container box, Class<T> clazz) { + if (box == null) return null; + + List<T> boxes = box.getBoxes(clazz); + if (boxes.size() == 0) { + return null; + } + return boxes.get(0); + } +} Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,268 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.tika.parser.video;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * <p>
 * Parser for metadata contained in Flash Videos (.flv). Resources:
 * http://osflash.org/flv and for AMF:
 * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
 * <p>
 * This parser is capable of extracting the general metadata from header as well
 * as embedded metadata.
 * <p>
 * Known keys for metadata (from file header):
 * <ol>
 * <li>hasVideo: true|false
 * <li>hasAudio: true|false
 * </ol>
 * <p>
 * In addition to the above values also metadata that is inserted in to the
 * actual stream will be picked. Usually there are keys like:
 * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
 * hasMetadata, audiosamplerate, videodatarate, metadatadate, videocodecid,
 * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
 * hasCuePoints, width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
 * duration, videosize, filesize, audiodatarate, hasAudio, stereo, audiodelay
 */
public class FLVParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = -8718013155719197679L;

    /** Tag type of the tags whose body is decoded as AMF metadata. */
    private static final int TYPE_METADATA = 0x12;

    // Bits of the header type-flags byte. In the FLV header layout audio is
    // bit 2 and video is bit 0; the two values used to be swapped here.
    /** Header flag bit: audio tags are present. */
    private static final byte MASK_AUDIO = 4;
    /** Header flag bit: video tags are present. */
    private static final byte MASK_VIDEO = 1;

    /** AMF type marker that terminates an object or an ECMA array. */
    private static final int AMF_OBJECT_END = 9;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.video("x-flv"));

    /**
     * Returns the media types this parser handles.
     *
     * @param context the parse context (not consulted)
     * @return a singleton set holding {@code video/x-flv}
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /** Reads a big-endian unsigned 32-bit integer. */
    private long readUInt32(DataInputStream input) throws IOException {
        return input.readInt() & 0xFFFFFFFFL;
    }

    /**
     * Reads a big-endian unsigned 24-bit integer.
     *
     * @throws java.io.EOFException if the stream ends inside the value; the
     *         end-of-stream value -1 is no longer folded into the result
     */
    private int readUInt24(DataInputStream input) throws IOException {
        int uint = input.readUnsignedByte() << 16;
        uint += input.readUnsignedByte() << 8;
        uint += input.readUnsignedByte();
        return uint;
    }

    /**
     * Reads one AMF value.
     *
     * @param input the stream positioned at the value, or at its type marker
     *              when {@code type} is -1
     * @param type  the type marker already consumed by the caller, or -1 to
     *              read the marker from the stream first
     * @return a {@code Double}, {@code Boolean}, {@code String}, {@code Map},
     *         {@code List} or {@code Date} depending on the marker, the string
     *         {@code "UNDEFINED"} for marker 13, or {@code null} for any other
     *         marker (nothing is consumed for those)
     */
    private Object readAMFData(DataInputStream input, int type)
            throws IOException {
        if (type == -1) {
            type = input.readUnsignedByte();
        }
        switch (type) {
            case 0:
                return input.readDouble();
            case 1:
                return input.readUnsignedByte() == 1;
            case 2:
                return readAMFString(input);
            case 3:
                return readAMFObject(input);
            case 8:
                return readAMFEcmaArray(input);
            case 10:
                return readAMFStrictArray(input);
            case 11:
                final Date date = new Date((long) input.readDouble());
                input.readShort(); // time zone
                return date;
            case 13:
                return "UNDEFINED";
            default:
                return null;
        }
    }

    /** Reads a 32-bit element count followed by that many AMF values. */
    private Object readAMFStrictArray(DataInputStream input) throws IOException {
        long count = readUInt32(input);
        ArrayList<Object> list = new ArrayList<Object>();
        // long counter: the count is an unsigned 32-bit value
        for (long i = 0; i < count; i++) {
            list.add(readAMFData(input, -1));
        }
        return list;
    }

    /** Reads a 16-bit byte length followed by that many bytes of UTF-8 text. */
    private String readAMFString(DataInputStream input) throws IOException {
        int size = input.readUnsignedShort();
        byte[] chars = new byte[size];
        input.readFully(chars);
        return new String(chars, UTF_8);
    }

    /** Reads key/value pairs until the object end marker is found. */
    private Object readAMFObject(DataInputStream input) throws IOException {
        HashMap<String, Object> array = new HashMap<String, Object>();
        while (true) {
            String key = readAMFString(input);
            int dataType = input.read();
            if (dataType == AMF_OBJECT_END) { // object end marker
                break;
            }
            array.put(key, readAMFData(input, dataType));
        }
        return array;
    }

    /**
     * Reads a 32-bit entry count followed by key/value pairs. Reading stops at
     * the declared count, or earlier if the object end marker shows up first,
     * so that an overstated count does not make the marker look like an entry.
     */
    private Object readAMFEcmaArray(DataInputStream input) throws IOException {
        long size = readUInt32(input);
        HashMap<String, Object> array = new HashMap<String, Object>();
        // long counter: the count is an unsigned 32-bit value
        for (long i = 0; i < size; i++) {
            String key = readAMFString(input);
            int dataType = input.read();
            if (dataType == AMF_OBJECT_END) {
                break;
            }
            array.put(key, readAMFData(input, dataType));
        }
        return array;
    }

    /** Consumes three bytes and reports whether they spell {@code FLV}. */
    private boolean checkSignature(DataInputStream fis) throws IOException {
        return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
    }

    /**
     * Skips exactly {@code count} bytes.
     *
     * @throws java.io.EOFException if the stream ends before that
     */
    private static void skipFully(DataInputStream input, int count) throws IOException {
        int remaining = count;
        while (remaining > 0) {
            int skipped = input.skipBytes(remaining);
            if (skipped <= 0) {
                // skipBytes may skip nothing without being at the end of the
                // stream; reading one byte makes progress or throws EOFException
                input.readByte();
                skipped = 1;
            }
            remaining -= skipped;
        }
    }

    /**
     * Reads a tag body of the given length. If the stream ends early the bytes
     * read so far are kept and the rest of the buffer stays zero.
     */
    private static byte[] readTagBody(DataInputStream input, int datalen) throws IOException {
        byte[] body = new byte[datalen];
        int readCount = 0;
        while (readCount < datalen) {
            int r = input.read(body, readCount, datalen - readCount);
            if (r == -1) {
                break;
            }
            readCount += r;
        }
        return body;
    }

    /** Decodes a metadata tag body and copies its non-null entries into the metadata. */
    private void copyMetadata(byte[] metaBytes, Metadata metadata) throws IOException {
        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(metaBytes));

        // Two AMF values are read and only the second one is used
        Object data = null;
        for (int i = 0; i < 2; i++) {
            data = readAMFData(dis, -1);
        }

        if (data instanceof Map) {
            // TODO if there are multiple metadata values with same key (in
            // separate AMF blocks, we currently lose previous values)
            for (Entry<?, ?> entry : ((Map<?, ?>) data).entrySet()) {
                if (entry.getValue() == null) {
                    continue;
                }
                metadata.set(String.valueOf(entry.getKey()), entry.getValue().toString());
            }
        }
    }

    /**
     * Parses an FLV stream: validates the header, records the header flags and
     * copies the entries of every metadata tag into {@code metadata}. No text
     * content is produced.
     *
     * @param stream   the document to parse
     * @param handler  receives the (empty) XHTML document
     * @param metadata receives the extracted metadata
     * @param context  the parse context (not consulted)
     * @throws IOException   if the stream cannot be read or ends unexpectedly
     * @throws SAXException  if the content handler rejects an event
     * @throws TikaException if the signature, version, header length or first
     *                       previous-block size is not the expected one
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        DataInputStream datainput = new DataInputStream(stream);
        if (!checkSignature(datainput)) {
            throw new TikaException("FLV signature not detected");
        }

        // header
        int version = datainput.readUnsignedByte();
        if (version != 1) {
            // should be 1, perhaps this is not flv?
            throw new TikaException("Unexpected FLV version: " + version);
        }

        int typeFlags = datainput.readUnsignedByte();

        long len = readUInt32(datainput);
        if (len != 9) {
            // we only know about format with header of 9 bytes
            throw new TikaException("Unexpected FLV header length: " + len);
        }

        long sizePrev = readUInt32(datainput);
        if (sizePrev != 0) {
            // should be 0, perhaps this is not flv?
            throw new TikaException(
                    "Unexpected FLV first previous block size: " + sizePrev);
        }

        metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
        metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
        metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // flv tag stream follows...
        while (true) {
            int type = datainput.read();
            if (type == -1) {
                // EOF
                break;
            }

            int datalen = readUInt24(datainput); // body length
            readUInt32(datainput); // timestamp
            readUInt24(datainput); // streamid

            if (type == TYPE_METADATA) {
                // found metadata Tag, read content to buffer
                copyMetadata(readTagBody(datainput, datalen), metadata);
            } else {
                // Tag was not metadata, skip over data we cannot handle
                skipFully(datainput, datalen);
            }

            sizePrev = readUInt32(datainput); // previous block size
            if (sizePrev != datalen + 11) {
                // file was corrupt or we could not parse it...
                break;
            }
        }

        xhtml.endDocument();
    }

}
Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1722029&r1=1722028&r2=1722029&view=diff ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Mon Dec 28 23:22:46 2015 @@ -14,5 +14,14 @@ # limitations under the License.
+org.apache.tika.parser.image.BPGParser org.apache.tika.parser.image.ImageParser +org.apache.tika.parser.image.PSDParser +org.apache.tika.parser.image.TiffParser +org.apache.tika.parser.image.WebPParser org.apache.tika.parser.jpeg.JpegParser +org.apache.tika.parser.audio.AudioParser +org.apache.tika.parser.audio.MidiParser +org.apache.tika.parser.mp3.Mp3Parser +org.apache.tika.parser.mp4.MP4Parser +org.apache.tika.parser.video.FLVParser Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.audio;

import static org.junit.Assert.assertEquals;

import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.junit.Test;

/**
 * Checks the metadata reported for the WAV, AIFF and AU sample files. All
 * three are expected to report 44.1 kHz, two channels, 16 bits, signed PCM,
 * and to yield no text.
 */
public class AudioParserTest {

    @Test
    public void testWAV() throws Exception {
        verifyPcmSample("/test-documents/testWAV.wav", "audio/x-wav");
    }

    @Test
    public void testAIFF() throws Exception {
        verifyPcmSample("/test-documents/testAIFF.aif", "audio/x-aiff");
    }

    @Test
    public void testAU() throws Exception {
        verifyPcmSample("/test-documents/testAU.au", "audio/basic");
    }

    /**
     * Parses the given classpath resource and checks its content type, the
     * audio format values shared by all samples, and the empty text content.
     */
    private static void verifyPcmSample(String resource, String expectedType)
            throws Exception {
        final Metadata extracted = new Metadata();
        final String text = new Tika().parseToString(
                AudioParserTest.class.getResourceAsStream(resource), extracted);

        assertEquals(expectedType, extracted.get(Metadata.CONTENT_TYPE));
        assertEquals("44100.0", extracted.get("samplerate"));
        assertEquals("2", extracted.get("channels"));
        assertEquals("16", extracted.get("bits"));
        assertEquals("PCM_SIGNED", extracted.get("encoding"));

        assertEquals("", text);
    }

}
Added:
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.audio;

import static org.junit.Assert.assertEquals;
import static org.apache.tika.TikaTest.assertContains;

import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.junit.Test;

/**
 * Checks the metadata and text extracted from the MIDI sample file.
 */
public class MidiParserTest {

    /** Classpath location of the sample file. */
    private static final String SAMPLE = "/test-documents/testMID.mid";

    @Test
    public void testMID() throws Exception {
        final Metadata extracted = new Metadata();
        final String text = new Tika().parseToString(
                MidiParserTest.class.getResourceAsStream(SAMPLE), extracted);

        // Metadata reported for the file
        assertEquals("audio/midi", extracted.get(Metadata.CONTENT_TYPE));
        assertEquals("2", extracted.get("tracks"));
        assertEquals("0", extracted.get("patches"));
        assertEquals("PPQ", extracted.get("divisionType"));

        // Text content
        assertContains("Untitled", text);
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.tika.parser.image;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Photoshop;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Checks the metadata the BPG parser reports for three sample files. The
 * assertions inside the {@code if (false)} blocks are disabled on purpose:
 * they record the values expected once TIKA-1495 is resolved.
 */
public class BPGParserTest {
    private final Parser parser = new BPGParser();

    /**
     * Parses a classpath resource with the content type preset to
     * {@code image/x-bpg} and closes the stream afterwards.
     *
     * @param resource classpath location of the sample file
     * @return the metadata filled in by the parser
     */
    private Metadata parseBpg(String resource) throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg");
        // The parser is not relied upon to close the stream, so do it here
        try (InputStream stream = getClass().getResourceAsStream(resource)) {
            parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
        }
        return metadata;
    }

    /**
     * Tests a very basic file, without much metadata
     */
    @Test
    public void testBPG() throws Exception {
        Metadata metadata = parseBpg("/test-documents/testBPG.bpg");

        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
        assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
        assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));
    }

    /**
     * Tests a file with comments
     */
    @Test
    public void testBPG_Commented() throws Exception {
        Metadata metadata = parseBpg("/test-documents/testBPG_commented.bpg");

        assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
        assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
        assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
        assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));

        // TODO Get the exif comment data to be properly extracted, see TIKA-1495
        if (false) {
            assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
            List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
            assertTrue(keywords.contains("coast"));
            assertTrue(keywords.contains("bird watching"));
            assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
        }

        // TODO Get the exif data to be properly extracted, see TIKA-1495
        if (false) {
            assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
            assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
            assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
            assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
            assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
            assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
            assertEquals(null, metadata.get(Metadata.SOFTWARE));
            assertEquals("1", metadata.get(Metadata.ORIENTATION));
            assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
            assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
            assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
        }
    }

    /**
     * Tests a file with geographic information in it
     */
    @Test
    public void testBPG_Geo() throws Exception {
        Metadata metadata = parseBpg("/test-documents/testBPG_GEO.bpg");

        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
        assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
        assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));

        // TODO Get the geographic data to be properly extracted, see TIKA-1495
        if (false) {
            assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
            assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
        }

        // TODO Get the exif data to be properly extracted, see TIKA-1495
        if (false) {
            assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
            assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
            assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
            assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
            assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
            assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
            assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
            assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
            assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
            assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
            assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java?rev=1722029&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (added) +++
tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java Mon Dec 28 23:22:46 2015 @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.image; + + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.Arrays; +import java.util.GregorianCalendar; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.TimeZone; + +import com.drew.metadata.Directory; +import com.drew.metadata.MetadataException; +import com.drew.metadata.Tag; +import com.drew.metadata.exif.ExifIFD0Directory; +import com.drew.metadata.exif.ExifSubIFDDirectory; +import com.drew.metadata.jpeg.JpegCommentDirectory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; + +public class ImageMetadataExtractorTest { + + @SuppressWarnings({"rawtypes", "unchecked"}) + 
@Test + public void testHandleDirectories() throws MetadataException { + Metadata metadata = mock(Metadata.class); + ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class); + ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1); + + Directory directory = new JpegCommentDirectory(); + Iterator directories = mock(Iterator.class); + when(directories.hasNext()).thenReturn(true, false); + when(directories.next()).thenReturn(directory); + when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true); + + e.handle(directories); + verify(handler1).supports(JpegCommentDirectory.class); + verify(handler1).handle(directory, metadata); + } + + @Test + public void testExifHandlerSupports() { + assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class)); + assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class)); + assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class)); + assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class)); + } + + @Test + public void testExifHandlerParseDate() throws MetadataException { + ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class); + when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true); + GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT); + calendar.setTimeInMillis(0); + calendar.set(2000, 0, 1, 0, 0, 0); + when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn( + calendar.getTime()); // jvm default timezone as in Metadata Extractor + Metadata metadata = new Metadata(); + + new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); + assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00", + metadata.get(TikaCoreProperties.CREATED)); + } + + @Test + public void testExifHandlerParseDateFallback() throws MetadataException { + ExifIFD0Directory 
exif = mock(ExifIFD0Directory.class); + when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true); + GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT); + calendar.setTimeInMillis(0); + calendar.set(1999, 0, 1, 0, 0, 0); + when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn( + calendar.getTime()); // jvm default timezone as in Metadata Extractor + Metadata metadata = new Metadata(); + + new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); + assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00", + metadata.get(TikaCoreProperties.CREATED)); + } + + @Test + public void testExifHandlerParseDateError() throws MetadataException { + ExifIFD0Directory exif = mock(ExifIFD0Directory.class); + when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true); + when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null); + Metadata metadata = new Metadata(); + + new ImageMetadataExtractor.ExifHandler().handle(exif, metadata); + assertEquals("Parsing should proceed without date", null, + metadata.get(TikaCoreProperties.CREATED)); + } + + @Test + public void testCopyUnknownFieldsHandler() throws MetadataException { + Directory d = mock(Directory.class); + Tag t1 = mock(Tag.class); + when(t1.getTagName()).thenReturn("Image Description"); + when(t1.getDescription()).thenReturn("t1"); + Tag t2 = mock(Tag.class); + when(t2.getTagName()).thenReturn(Metadata.KEYWORDS); + when(t2.getDescription()).thenReturn("known"); + Tag t3 = mock(Tag.class); + when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName()); + when(t3.getDescription()).thenReturn("known"); + List<Tag> tags = Arrays.asList(t1, t2, t3); + when(d.getTags()).thenReturn(tags); + Metadata metadata = new Metadata(); + new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata); + assertEquals("t1", metadata.get("Image Description")); + assertNull("keywords 
should be excluded from bulk copy because it is a defined field", + metadata.get(Metadata.KEYWORDS)); + assertNull(metadata.get(TikaCoreProperties.DESCRIPTION)); + } + +}
