jukka
Mon, 14 Dec 2009 14:18:01 -0800
Author: jukka Date: Mon Dec 14 22:17:28 2009 New Revision: 890503 URL: http://svn.apache.org/viewvc?rev=890503&view=rev Log: TIKA-328: Add parser for .flv videos Patch by Sami Siren Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testFLV.flv (with props) Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml?rev=890503&r1=890502&r2=890503&view=diff ============================================================================== --- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml (original) +++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml Mon Dec 14 22:17:28 2009 @@ -172,6 +172,10 @@ <mime>application/epub+zip</mime> </parser> + <parser name="parse-flv" class="org.apache.tika.parser.video.FLVParser"> + <mime>video/x-flv</mime> + </parser> + </parsers> </properties> \ No newline at end of file Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java?rev=890503&view=auto ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java (added) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java Mon Dec 14 22:17:28 2009 @@ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.video;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * <p>
 * Parser for metadata contained in Flash Videos (.flv). Resources:
 * http://osflash.org/flv and for AMF:
 * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
 * <p>
 * This parser is capable of extracting the general metadata from the header
 * as well as embedded metadata.
 * <p>
 * Known keys for metadata (from file header):
 * <ol>
 * <li>hasVideo: true|false
 * <li>hasAudio: true|false
 * </ol>
 * <p>
 * In addition to the above values, metadata that is inserted into the actual
 * stream (script data tags) is picked up as well. Usually there are keys like:
 * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
 * hasMetadata, audiosamplerate, videodatarate, metadatadate, videocodecid,
 * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
 * hasCuePoints, width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
 * duration, videosize, filesize, audiodatarate, hasAudio, stereo, audiodelay
 * <p>
 * Parsing is best effort: a file that is truncated in the middle of a tag, or
 * whose tag chain is inconsistent, yields the metadata collected up to that
 * point instead of an error.
 */
public class FLVParser implements Parser {

    /** Tag type of a script data ("onMetaData") tag. */
    private static final int TYPE_METADATA = 0x12;

    /** Header flag bit signalling that audio tags are present (bit 2). */
    private static final int MASK_AUDIO = 0x04;

    /** Header flag bit signalling that video tags are present (bit 0). */
    private static final int MASK_VIDEO = 0x01;

    /** AMF0 marker that terminates the property list of an object or ECMA array. */
    private static final int AMF_OBJECT_END = 9;

    /** Size in bytes of the fixed FLV header and of a tag header. */
    private static final int HEADER_LENGTH = 9;
    private static final int TAG_HEADER_LENGTH = 11;

    /**
     * Reads a big-endian unsigned 32 bit integer.
     *
     * @throws EOFException if the stream ends before four bytes were read
     */
    private long readUInt32(DataInputStream input) throws IOException {
        return input.readInt() & 0xFFFFFFFFL;
    }

    /**
     * Reads a big-endian unsigned 24 bit integer.
     *
     * @throws EOFException if the stream ends before three bytes were read
     */
    private int readUInt24(DataInputStream input) throws IOException {
        // readUnsignedByte() reports EOF as an exception; plain read() would
        // silently fold its -1 return value into the result.
        int high = input.readUnsignedByte();
        int middle = input.readUnsignedByte();
        int low = input.readUnsignedByte();
        return (high << 16) | (middle << 8) | low;
    }

    /**
     * Skips exactly <code>count</code> bytes.
     *
     * @throws EOFException if the stream ends before all bytes were skipped
     */
    private void skipFully(DataInputStream input, long count)
            throws IOException {
        long remaining = count;
        while (remaining > 0) {
            long skipped = input.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else {
                // skip() may return 0 without being at EOF; consuming a single
                // byte either makes progress or raises EOFException.
                input.readUnsignedByte();
                remaining--;
            }
        }
    }

    /**
     * Decodes one AMF0 value.
     *
     * @param input source of the encoded value
     * @param type AMF0 type marker of the value, or -1 if the marker still
     *        has to be read from the stream
     * @return Double, Boolean, String, Map, ArrayList or Date depending on the
     *         marker; <code>null</code> for markers this parser does not decode
     */
    private Object readAMFData(DataInputStream input, int type)
            throws IOException {
        if (type == -1) {
            type = input.readUnsignedByte();
        }
        switch (type) {
        case 0: // number
            return input.readDouble();
        case 1: // boolean
            return input.readUnsignedByte() == 1;
        case 2: // string
            return readAMFString(input);
        case 3: // object
            return readAMFObject(input);
        case 8: // ECMA array
            return readAMFEcmaArray(input);
        case 10: // strict array
            return readAMFStrictArray(input);
        case 11: // date: milliseconds as double followed by a 16 bit time zone
            final Date date = new Date((long) input.readDouble());
            input.readShort(); // time zone, always consumed in full
            return date;
        case 13:
            return "UNDEFINED";
        default:
            return null;
        }
    }

    /** Decodes an AMF0 strict array: a 32 bit count followed by that many values. */
    private Object readAMFStrictArray(DataInputStream input) throws IOException {
        long count = readUInt32(input);
        // The count comes from the file, so the list is deliberately not
        // pre-sized with it; a bogus count ends with EOFException instead.
        ArrayList<Object> list = new ArrayList<Object>();
        for (long i = 0; i < count; i++) {
            list.add(readAMFData(input, -1));
        }
        return list;
    }

    /** Decodes an AMF0 short string: a 16 bit byte length followed by UTF-8 data. */
    private String readAMFString(DataInputStream input) throws IOException {
        int size = input.readUnsignedShort();
        byte[] chars = new byte[size];
        input.readFully(chars);
        // AMF0 strings are UTF-8; do not depend on the platform charset.
        return new String(chars, "UTF-8");
    }

    /**
     * Reads key/value properties into <code>target</code> until the object
     * end marker is found.
     *
     * @throws EOFException if the stream ends before the end marker; the
     *         properties decoded so far are already stored in the target
     */
    private void readAMFProperties(
            DataInputStream input, Map<String, Object> target)
            throws IOException {
        while (true) {
            String key = readAMFString(input);
            int dataType = input.readUnsignedByte();
            if (dataType == AMF_OBJECT_END) {
                return;
            }
            target.put(key, readAMFData(input, dataType));
        }
    }

    /** Decodes an AMF0 object: properties terminated by the object end marker. */
    private Object readAMFObject(DataInputStream input) throws IOException {
        HashMap<String, Object> object = new HashMap<String, Object>();
        readAMFProperties(input, object);
        return object;
    }

    /**
     * Decodes an AMF0 ECMA array. The leading 32 bit count is only a hint, so
     * the properties are read up to the object end marker (which is thereby
     * consumed) rather than for exactly "count" iterations.
     */
    private Object readAMFEcmaArray(DataInputStream input) throws IOException {
        readUInt32(input); // approximate element count, not authoritative
        HashMap<String, Object> array = new HashMap<String, Object>();
        try {
            readAMFProperties(input, array);
        } catch (EOFException ignored) {
            // Data ended without an end marker: keep the properties that were
            // decoded completely.
        }
        return array;
    }

    /** Consumes three bytes and reports whether they are the "FLV" signature. */
    private boolean checkSignature(DataInputStream fis) throws IOException {
        return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
    }

    /**
     * Reads the remainder of the FLV header (after the signature) and records
     * the "hasVideo" / "hasAudio" flags.
     *
     * @return <code>true</code> if the header has the known layout and the tag
     *         stream can be read, <code>false</code> otherwise
     */
    private boolean readHeader(DataInputStream input, Metadata metadata)
            throws IOException {
        int version = input.readUnsignedByte();
        if (version != 1) {
            // should be 1, perhaps this is not flv?
            return false;
        }

        int typeFlags = input.readUnsignedByte();
        metadata.add("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
        metadata.add("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));

        if (readUInt32(input) != HEADER_LENGTH) {
            // we only know about format with header of 9 bytes
            return false;
        }

        // size of the (non-existent) tag before the first one; should be 0,
        // otherwise this is perhaps not flv
        return readUInt32(input) == 0;
    }

    /**
     * Walks the tag stream, decoding script data tags and skipping all others,
     * until the stream ends or the tag chain is found to be inconsistent.
     */
    private void readTags(DataInputStream input, Metadata metadata)
            throws IOException {
        try {
            while (true) {
                int type = input.read();
                if (type == -1) {
                    return; // regular end of file
                }

                int datalen = readUInt24(input); // body length
                skipFully(input, 4 + 3); // timestamp + stream id

                if (type == TYPE_METADATA) {
                    byte[] metaBytes = new byte[datalen];
                    input.readFully(metaBytes);
                    addMetadata(metaBytes, metadata);
                } else {
                    // Tag was not metadata, skip over data we cannot handle
                    skipFully(input, datalen);
                }

                long sizePrev = readUInt32(input); // previous block size
                if (sizePrev != datalen + TAG_HEADER_LENGTH) {
                    // file was corrupt or we could not parse it...
                    return;
                }
            }
        } catch (EOFException ignored) {
            // File is truncated in the middle of a tag; keep what was
            // extracted from the complete tags before it.
        }
    }

    /**
     * Decodes the body of a script data tag (a name followed by its value) and
     * copies the entries of a map value into the metadata.
     */
    private void addMetadata(byte[] metaBytes, Metadata metadata)
            throws IOException {
        DataInputStream dis =
            new DataInputStream(new ByteArrayInputStream(metaBytes));

        Object data = null;
        try {
            // first value is the event name ("onMetaData"), second the payload
            for (int i = 0; i < 2; i++) {
                data = readAMFData(dis, -1);
            }
        } catch (EOFException ignored) {
            // Malformed AMF payload; skip this tag only, the enclosing tag
            // stream is still positioned correctly.
            return;
        }

        if (data instanceof Map) {
            // TODO if there are multiple metadata values with same key (in
            // separate AMF blocks), we currently lose previous values
            for (Entry<?, ?> entry : ((Map<?, ?>) data).entrySet()) {
                Object value = entry.getValue();
                if (value != null) { // null/undefined/unsupported AMF values
                    metadata.set(String.valueOf(entry.getKey()), value.toString());
                }
            }
        }
    }

    /**
     * Extracts FLV header flags and embedded script metadata into the given
     * metadata object. No text content is produced; the handler receives an
     * empty, but always properly terminated, XHTML document.
     *
     * @param stream the FLV document; it is read but not closed
     * @param handler receiver of the (empty) XHTML output
     * @param metadata receiver of the extracted metadata
     * @param context parse context, not used by this parser
     * @throws IOException if the stream can not be read or ends inside the
     *         file header
     * @throws SAXException if the handler fails
     * @throws TikaException if the stream does not start with the FLV signature
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        DataInputStream datainput = new DataInputStream(stream);
        if (!checkSignature(datainput)) {
            throw new TikaException("FLV signature not detected");
        }

        if (readHeader(datainput, metadata)) {
            // flv tag stream follows...
            readTags(datainput, metadata);
        }

        // Reached on every non-exceptional path so that the SAX document
        // started above is always closed.
        xhtml.endDocument();
    }

    /**
     * Convenience variant of
     * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
     * for callers without a parse context.
     */
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.video;

import java.io.InputStream;

import junit.framework.TestCase;

import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;

/**
 * Checks that metadata of the sample Flash video is extracted when the
 * document is run through the {@link Tika} facade.
 */
public class FLVParserTest extends TestCase {

    /**
     * Parses the sample video and verifies the detected content type together
     * with a selection of header and embedded metadata values.
     */
    public void testFLV() throws Exception {
        String path = "/test-documents/testFLV.flv";
        Metadata metadata = new Metadata();

        InputStream stream = FLVParserTest.class.getResourceAsStream(path);
        // Fail with a clear message instead of an obscure error further down
        assertNotNull("Test document not found: " + path, stream);

        // Only the metadata is of interest; the extracted text is discarded.
        // NOTE(review): the facade is expected to close the stream - confirm.
        new Tika().parseToString(stream, metadata);

        assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("true", metadata.get("hasVideo"));
        assertEquals("false", metadata.get("stereo"));
        assertEquals("true", metadata.get("hasAudio"));
        assertEquals("120.0", metadata.get("height"));
        assertEquals("16.0", metadata.get("audiosamplesize"));
    }

}