Hi all.
Here is the first Matroska parser for the mmpython package.
It currently handles :
 - Stream recognition
 - Video length
 - Tracks identification
 - Video specific settings (width, height, fps, codec)
 - Audio specific settings (samplerate, channels, codec)

The Matroska format is based on EBML, a kind of binary XML. This means that
the format is very stable across new releases (i.e. new data fields are
ignored by older parsers without crashing), but it also means that it is
much more complicated to parse than a 'hardcoded' bitstream, as some data
fields may or may not be present.

Don't hesitate to test it against your MKV bitstreams.

Of course, you need to add the line "import video.mkvinfo" to the __init__.py
of mmpython.

BTW, some questions about mmpython internals:
1 - What is the unit of the length? Seconds? Minutes? A date string?
2 - Matroska manages subtitles as tracks (i.e. like video and audio). I did
not find a 'SubtitleMedia' object in the MediaAV object. Is there any way to
specify embedded multi-subtitle tracks in mmpython?

Regards, Sylvain.
#if 0
# -----------------------------------------------------------------------
# mkvinfo.py - Matroska Streaming Video Files
# -----------------------------------------------------------------------
# $Id$
#
# $Log$
# -----------------------------------------------------------------------
# MMPython - Media Metadata for Python
# Copyright (C) 2003 Thomas Schueppel, Dirk Meyer
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MER-
# CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# -----------------------------------------------------------------------
#endif


from mmpython import mediainfo
import mmpython
import struct
import re
import stat
import os
import math
from types import *
from struct import *
from string import *

# Module-local alias for mediainfo._debug, used for every diagnostic message
# emitted by this module.
_print = mediainfo._debug

# Main IDs for the Matroska streams.
# Each ID is stored as the raw byte string found in the file, so it can be
# compared directly with EbmlEntity.get_str_id().
# HEADER and SEGMENT are read at the top of the file; TRACKS and SEGMENT_INFO
# are looked up among the children of the segment.  SEEK_HEAD, CLUSTER, VOID
# and CRC are defined but not referenced by the code below.
MATROSKA_HEADER_ID       = '\x1A\x45\xDF\xA3'
MATROSKA_TRACKS_ID       = '\x16\x54\xAE\x6B'
MATROSKA_SEGMENT_ID      = '\x18\x53\x80\x67'
MATROSKA_SEEK_HEAD_ID    = '\x11\x4D\x9B\x74'
MATROSKA_SEGMENT_INFO_ID = '\x15\x49\xA9\x66'
MATROSKA_CLUSTER_ID      = '\x1F\x43\xB6\x75'
MATROSKA_VOID_ID         = '\xEC'
MATROSKA_CRC_ID          = '\xBF'
# Elements looked up inside each track by MkvInfo.process_one_track.
# NOTE: despite its name, MATROSKA_VIDEO_TYPE_ID is used for every track; its
# value is compared with MATROSKA_VIDEO_TRACK / MATROSKA_AUDIO_TRACK below.
MATROSKA_VIDEO_TYPE_ID   = '\x83'
MATROSKA_CODEC_ID        = '\x86'
# CODEC_NAME is defined but not referenced by the code below.
MATROSKA_CODEC_NAME_ID   = '\x25\x86\x88'
# Frame duration; the fps computation treats its value as nanoseconds.
MATROSKA_FRAME_DURATION_ID = '\x23\xE3\x83'
# Video settings container, and the two children read from it.
MATROSKA_VIDEO_SETTINGS_ID = '\xE0'
MATROSKA_VID_WIDTH_ID      = '\xB0'
MATROSKA_VID_HEIGHT_ID     = '\xBA'
# Audio settings container, and the two children read from it.
MATROSKA_AUDIO_SETTINGS_ID = '\xE1'
MATROSKA_AUDIO_SAMPLERATE_ID = '\xB5'
MATROSKA_AUDIO_CHANNELS_ID   = '\x9F'
# Track-level language element (only read for audio tracks below).
MATROSKA_TRACK_LANGUAGE_ID   = '\x22\xB5\x9C'
# Elements looked up inside the segment information to compute the length.
MATROSKA_TIMECODESCALE_ID    = '\x2A\xD7\xB1'
MATROSKA_DURATION_ID         = '\x44\x89'
# MUXING_APP and WRITING_APP are defined but not referenced by the code below.
MATROSKA_MUXING_APP_ID       = '\x4D\x80'
MATROSKA_WRITING_APP_ID      = '\x57\x41'

# Values of the track type element that this module distinguishes.
MATROSKA_VIDEO_TRACK = 1
MATROSKA_AUDIO_TRACK = 2

# This class is responsible for handling one EBML entity as described in the
# Matroska/EBML spec
class EbmlEntity:
    """One EBML element (id, size field and payload) read from a buffer.

    The element is parsed from the very beginning of the byte string given
    to the constructor.  After construction these attributes are available:

      valid       -- 1 when an id and a size field were decoded, else 0
      id_len      -- number of bytes used by the id (0 when not recognised)
      entity_id   -- the id as an integer, marker bits included (0 if none)
      entity_str  -- the id as the raw bytes taken from the buffer
      len_size    -- number of bytes used by the size field
      entity_len  -- number of payload bytes actually present in the buffer
      entity_data -- the payload bytes
      value       -- the payload decoded as a big-endian unsigned integer
                     when it is complete and 1 to 4 bytes long, otherwise 0

    entity_len, entity_data and value are only set on valid entities, so
    get_len(), get_data(), get_value() and get_total_len() must not be used
    when valid is 0.
    """

    def __init__(self, inbuf):
        """Parse the entity located at the start of the byte string inbuf."""
        self.valid = 0
        # Compute the EBML id
        self.compute_id(inbuf)
        if self.id_len == 0:
            _print("EBML entity not found, bad file format")
            return
        declared_len = self.compute_len(inbuf[self.id_len:])
        if declared_len is None:
            # No size width starts with a zero byte
            _print("EBML entity size not found, bad file format")
            return
        self.valid = 1
        start = self.id_len + self.len_size
        if declared_len == -1:
            # Size not decoded (5 to 8 byte field or truncated field): the
            # entity is taken to run up to the end of the read buffer
            self.entity_data = inbuf[start:]
        else:
            self.entity_data = inbuf[start:start + declared_len]
        # Obviously, an entity can be very long (ie the whole file), so its
        # length is always limited to what the read buffer really contains.
        # This keeps callers that walk children from running past the buffer.
        self.entity_len = len(self.entity_data)
        # If the size is 1, 2, 3 or 4 it could be a numeric value, so do the
        # job; a payload cut short by the buffer is not decoded
        self.value = 0
        if self.entity_len == declared_len and 1 <= declared_len <= 4:
            self.value = self._be_uint(self.entity_data)

    def _be_uint(self, data):
        """Return the byte string data decoded as a big-endian unsigned int."""
        # struct is used instead of ord() so that the same code accepts the
        # byte strings of both old and recent Python versions
        result = 0
        for byte in unpack('%dB' % len(data), data):
            result = (result << 8) | byte
        return result

    def compute_id(self, inbuf):
        """Decode the id at the start of inbuf.

        Sets id_len, entity_id and entity_str.  id_len is left at 0 when the
        buffer is empty, when no marker bit is found in the four high bits of
        the first byte, or when the buffer ends in the middle of the id.
        """
        self.id_len = 0
        self.entity_id = 0
        if len(inbuf) > 0:
            first = self._be_uint(inbuf[0:1])
            if first & 0x80:
                width = 1
            elif first & 0x40:
                width = 2
            elif first & 0x20:
                width = 3
            elif first & 0x10:
                width = 4
            else:
                width = 0
            if width and len(inbuf) >= width:
                self.id_len = width
                # The marker bits are kept as part of the id
                self.entity_id = self._be_uint(inbuf[:width])
        self.entity_str = inbuf[0:self.id_len]
        return

    def compute_len(self, inbuf):
        """Decode the size field at the start of inbuf and set len_size.

        Returns the payload length for size fields of 1 to 4 bytes, -1 when
        the length is not decoded (size field of 5 to 8 bytes, or size field
        cut short by the end of the buffer) and None when the first byte is
        zero, which does not match any size width.
        """
        # Here we just handle the size up to 4 bytes
        # The size above will be truncated by the read buffer itself
        self.len_size = 0
        if len(inbuf) == 0:
            # The buffer stops right after the id
            return -1
        first = self._be_uint(inbuf[0:1])
        # The position of the first set bit gives the width of the field
        mask = 0x80
        size = 1
        while mask and not (first & mask):
            mask >>= 1
            size += 1
        if not mask:
            return None
        self.len_size = size
        if size > 4 or len(inbuf) < size:
            return -1
        # Drop the marker bit: a field of n bytes carries 7 * n value bits
        return self._be_uint(inbuf[:size]) & ((1 << (7 * size)) - 1)

    def get_value(self):
        """Return the payload as an integer (0 when it was not decoded)."""
        value = self.value
        return value

    def get_data(self):
        """Return the payload bytes present in the buffer."""
        return self.entity_data

    def get_id(self):
        """Return the id as an integer."""
        return self.entity_id

    def get_str_id(self):
        """Return the id as the raw bytes read from the buffer."""
        return self.entity_str

    def get_len(self):
        """Return the number of payload bytes present in the buffer."""
        return self.entity_len

    def get_total_len(self):
        """Return the bytes used by id, size field and available payload."""
        return self.entity_len+self.id_len+self.len_size


# This is the main Matroska object
class MkvInfo(mediainfo.AVInfo):
    """Media information read from the first 80000 bytes of a Matroska file.

    The constructor checks the EBML header, then walks the segment to fill
    in the length (in seconds) and one VideoInfo / AudioInfo object per
    video / audio track.  Parsing is best effort: an element that is missing
    or cannot be decoded is skipped and the remaining information is kept.
    """

    # Everything that decoding truncated or malformed data can raise; used
    # so that a damaged file yields partial information instead of a crash
    _PARSE_ERRORS = (KeyError, IndexError, AttributeError, TypeError,
                     ValueError, ZeroDivisionError, struct.error)

    # Timecode scale (nanoseconds per tick) assumed when the file has none
    _DEFAULT_TIMECODESCALE = 1000000

    def __init__(self, file):
        """Parse file, an object whose read(size) returns a byte string."""
        mediainfo.AVInfo.__init__(self)
        self.samplerate = 1

        head = file.read(80000)
        if len(head) == 0:
            # Regular File end
            return

        # Check the Matroska header
        try:
            header = EbmlEntity(head)
            found = header.get_str_id() == MATROSKA_HEADER_ID
        except self._PARSE_ERRORS:
            found = 0
        if not found:
            self.valid = 0
            return
        _print("HEADER ID found %08X" % header.get_id())
        self.valid = 1
        self.mime = 'application/mkv'
        self.type = 'Matroska'

        # Now get the segment
        try:
            segment = EbmlEntity(head[header.get_total_len():])
        except self._PARSE_ERRORS:
            _print("SEGMENT ID not found")
            return
        if segment.get_str_id() != MATROSKA_SEGMENT_ID:
            if segment.valid:
                _print("SEGMENT ID not found %08X" % segment.get_id())
            else:
                _print("SEGMENT ID not found")
            return

        segtab = self.process_one_level(segment)
        seginfo = segtab.get(MATROSKA_SEGMENT_INFO_ID)
        if seginfo is None:
            _print("SEGMENT INFO ID not found !!")
        else:
            try:
                self._process_segment_info(seginfo)
            except self._PARSE_ERRORS:
                _print("Unable to read the segment information")
        tracks = segtab.get(MATROSKA_TRACKS_ID)
        if tracks is None:
            _print("TRACKS ID not found !!")
        else:
            try:
                self.process_tracks(tracks)
            except self._PARSE_ERRORS:
                _print("Unable to read the tracks")

    def _children(self, item):
        """Return the list of child entities found in the payload of item.

        The walk stops at the first child that cannot be parsed, so the
        result only holds valid entities.
        """
        if not item.valid:
            return []
        buf = item.get_data()
        # Never walk past the bytes that were really read
        limit = min(item.get_len(), len(buf))
        children = []
        indice = 0
        while indice < limit:
            try:
                elem = EbmlEntity(buf[indice:])
                step = elem.get_total_len()
            except self._PARSE_ERRORS:
                break
            if not elem.valid or step <= 0:
                break
            children.append(elem)
            indice += step
        return children

    def _unpack_float(self, data):
        """Decode a big-endian float stored on 4 or 8 bytes, else None."""
        size = len(data)
        if size == 4:
            return unpack('!f', data)[0]
        if size == 8:
            return unpack('!d', data)[0]
        return None

    def _process_segment_info(self, seginfo):
        """Set self.length, in seconds, from the segment information."""
        seginfotab = self.process_one_level(seginfo)
        scale_elem = seginfotab.get(MATROSKA_TIMECODESCALE_ID)
        if scale_elem is None:
            scalecode = self._DEFAULT_TIMECODESCALE
        else:
            scalecode = scale_elem.get_value()
        duration_elem = seginfotab.get(MATROSKA_DURATION_ID)
        if duration_elem is None or scalecode <= 0:
            # No duration, or a scale that could not be decoded
            return
        duration = self._unpack_float(duration_elem.get_data())
        if duration is None:
            return
        # The duration counts ticks and the scale is the length of one tick
        # in nanoseconds, so their product is the duration in nanoseconds
        self.length = duration * scalecode / 1000000000.0

    def process_tracks(self, tracks):
        """Process every child of the tracks entity as one track."""
        for trackelem in self._children(tracks):
            self.process_one_track(trackelem)

    def process_one_level(self, item):
        """Return a dict mapping raw id -> entity for the children of item.

        When an id appears several times only the last entity is kept.
        """
        tabelem = {}
        for elem in self._children(item):
            tabelem[elem.get_str_id()] = elem
        return tabelem

    def process_one_track(self, track):
        """Append a VideoInfo or AudioInfo built from track, if applicable.

        Children of the tracks entity without a track type, and tracks that
        are neither video nor audio, are ignored.
        """
        # Process all the items at the track level
        tabelem = self.process_one_level(track)
        # We have the dict of track elements, now build the MMPYTHON information
        tracktype = tabelem.get(MATROSKA_VIDEO_TYPE_ID)
        if tracktype is None:
            _print("Track type not found, track ignored")
            return
        kind = tracktype.get_value()
        if kind == MATROSKA_VIDEO_TRACK:
            self.video.append(self._build_video_info(tabelem))
        elif kind == MATROSKA_AUDIO_TRACK:
            self.audio.append(self._build_audio_info(tabelem))

        _print("Found %d elem for this track" % len(tabelem))

    def _build_video_info(self, tabelem):
        """Return a VideoInfo filled from the elements of a video track."""
        #VIDEOCORE = ['length', 'encoder', 'bitrate', 'samplerate', 'codec', 'samplebits',
        #     'width', 'height', 'fps', 'aspect']
        vi = mediainfo.VideoInfo()
        codec = tabelem.get(MATROSKA_CODEC_ID)
        if codec is None:
            vi.codec = 'Unknown'
        else:
            vi.codec = codec.get_data()
        vi.fps = 0
        frame = tabelem.get(MATROSKA_FRAME_DURATION_ID)
        if frame is not None and frame.get_value() > 0:
            # The frame duration is expressed in nanoseconds
            vi.fps = 1000000000.0 / frame.get_value()
        vinfo = tabelem.get(MATROSKA_VIDEO_SETTINGS_ID)
        if vinfo is None:
            _print("No info about video track !!!")
            return vi
        vidtab = self.process_one_level(vinfo)
        width = vidtab.get(MATROSKA_VID_WIDTH_ID)
        height = vidtab.get(MATROSKA_VID_HEIGHT_ID)
        if width is not None:
            vi.width = width.get_value()
        if height is not None:
            vi.height = height.get_value()
        if width is None or height is None:
            _print("No info about video track !!!")
        return vi

    def _build_audio_info(self, tabelem):
        """Return an AudioInfo filled from the elements of an audio track."""
        #AUDIOCORE = ['channels', 'samplerate', 'length', 'encoder', 'codec', 'samplebits',
        #     'bitrate', 'language']
        ai = mediainfo.AudioInfo()
        language = tabelem.get(MATROSKA_TRACK_LANGUAGE_ID)
        if language is None:
            ai.language = 'Unknown'
        else:
            ai.language = language.get_data()
        codec = tabelem.get(MATROSKA_CODEC_ID)
        if codec is None:
            ai.codec = 'Unknown'
        else:
            ai.codec = codec.get_data()
        ainfo = tabelem.get(MATROSKA_AUDIO_SETTINGS_ID)
        if ainfo is None:
            _print("No info about audio track !!!")
            return ai
        audtab = self.process_one_level(ainfo)
        rate = audtab.get(MATROSKA_AUDIO_SAMPLERATE_ID)
        if rate is not None:
            # The sample rate is stored as a float, so decode the raw bytes
            samplerate = self._unpack_float(rate.get_data())
            if samplerate is not None:
                ai.samplerate = samplerate
        channels = audtab.get(MATROSKA_AUDIO_CHANNELS_ID)
        if channels is not None:
            ai.channels = channels.get_value()
        if rate is None or channels is None:
            _print("No info about audio track !!!")
        return ai

# Make the parser known to mmpython.
# NOTE(review): the arguments look like (mime type, file extensions, media
# type, parser class) -- confirm against mmpython.registertype.
mmpython.registertype(
    'application/mkv',
    ('mkv', 'mka',),
    mediainfo.TYPE_AV,
    MkvInfo
)

Reply via email to