Author: tack Date: Thu Dec 7 22:31:30 2006 New Revision: 2181 Modified: trunk/metadata/src/video/mkvinfo.py
Log: Significant rework of mkv parser: much more correct. Modified: trunk/metadata/src/video/mkvinfo.py ============================================================================== --- trunk/metadata/src/video/mkvinfo.py (original) +++ trunk/metadata/src/video/mkvinfo.py Thu Dec 7 22:31:30 2006 @@ -102,6 +102,11 @@ MATROSKA_FILE_MIME_TYPE_ID = 0x4660 MATROSKA_FILE_DATA_ID = 0x465C +MATROSKA_SEEKHEAD_ID = 0x114D9B74 +MATROSKA_SEEK_ID = 0x4DBB +MATROSKA_SEEKID_ID = 0x53AB +MATROSKA_SEEK_POSITION_ID = 0x53AC + # See mkv spec for details: # http://www.matroska.org/technical/specs/index.html @@ -145,23 +150,26 @@ log.debug("EBML entity not found, bad file format") raise mediainfo.KaaMetadataParseError() - self.entity_len = self.compute_len(inbuf[self.id_len:]) - # Obviously, the segment can be very long (ie the whole file, so we - # truncate it at the read buffer size - if self.entity_len == -1: - self.entity_data = inbuf[self.id_len+self.len_size:] - self.entity_len = len(self.entity_data) # Set the remaining size - else: - self.entity_data = inbuf[self.id_len + \ - self.len_size:self.id_len + \ - self.len_size+self.entity_len] + self.entity_len, self.len_size = self.compute_len(inbuf[self.id_len:]) + self.entity_data = inbuf[self.get_header_len() : self.get_total_len()] + self.ebml_length = self.entity_len + self.entity_len = min(len(self.entity_data), self.entity_len) - # if the size is 1, 2 3 or 4 it could be a numeric value, so do the job + # if the data size is 8 or less, it could be a numeric value self.value = 0 if self.entity_len <= 8: for pos, shift in zip(range(self.entity_len), range((self.entity_len-1)*8, -1, -8)): self.value |= ord(self.entity_data[pos]) << shift + + def add_data(self, data): + maxlen = self.ebml_length - self.get_total_len() + if maxlen <= 0: + return + self.entity_data += data[:maxlen] + self.entity_len = len(self.entity_data) + + def compute_id(self, inbuf): first = ord(inbuf[0]) self.id_len = 0 @@ -181,44 +189,37 @@ (ord(inbuf[2])<<8) | (ord(inbuf[3])) self.entity_str = inbuf[0:self.id_len] + def compute_len(self, inbuf): - # Here we just handle the size up to 4 bytes - # The size above will be truncated by the read buffer itself - first = ord(inbuf[0]) - if first & 0x80: - self.len_size = 1 - return first - 0x80 - if first & 0x40: - self.len_size = 2 - (c1,c2) = unpack('BB',inbuf[:2]) - return ((c1-0x40)<<8) | (c2) - if first & 0x20: - self.len_size = 3 - (c1, c2, c3) = unpack('BBB',inbuf[:3]) - return ((c1-0x20)<<16) | (c2<<8) | (c3) - if first & 0x10: - self.len_size = 4 - (c1, c2, c3, c4) = unpack('BBBB',inbuf[:4]) - return ((c1-0x10)<<24) | (c2<<16) | (c3<<8) | c4 - if first & 0x08: - self.len_size = 5 - return -1 - if first & 0x04: - self.len_size = 6 - return -1 - if first & 0x02: - self.len_size = 7 - return -1 - if first & 0x01: - self.len_size = 8 - return -1 + i = num_ffs = 0 + len_mask = 0x80 + len = ord(inbuf[0]) + while not len & len_mask: + i += 1 + len_mask >>= 1 + if i >= 8: + return 0, 0 + + len &= len_mask - 1 + if len == len_mask - 1: + num_ffs += 1 + for p in range(i): + len = (len << 8) | ord(inbuf[p + 1]) + if len & 0xff == 0xff: + num_ffs += 1 + if num_ffs == i + 1: + len = 0 + return len, i + 1 + def get_crc_len(self): return self.crc_len + def get_value(self): return self.value + def get_float_value(self): if len(self.entity_data) == 4: return unpack('!f', self.entity_data)[0] @@ -226,26 +227,38 @@ return unpack('!d', self.entity_data)[0] return 0.0 + def get_data(self): return self.entity_data + def get_utf8(self): return unicode(self.entity_data, 'utf-8', 'replace') + def get_str(self): return unicode(self.entity_data, 'ascii', 'replace') + def get_id(self): return self.entity_id + def get_str_id(self): return self.entity_str + def get_len(self): return self.entity_len + def get_total_len(self): - return self.entity_len + self.id_len+self.len_size + return self.entity_len + self.id_len + self.len_size + + + def get_header_len(self): + return self.id_len + self.len_size + class MkvInfo(mediainfo.AVInfo): @@ -257,7 +270,9 @@ self.samplerate = 1 self.media = 'audio' - buffer = file.read(80000) + self.file = file + # Read enough that we're likely to get the full seekhead (FIXME: kludge) + buffer = file.read(2000) if len(buffer) == 0: # Regular File end raise mediainfo.KaaMetadataParseError() @@ -270,207 +285,256 @@ log.debug("HEADER ID found %08X" % header.get_id() ) self.mime = 'application/mkv' self.type = 'Matroska' - # Now get the segment - segment = EbmlEntity(buffer[header.get_total_len():]) - if segment.get_id() == MATROSKA_SEGMENT_ID: - log.debug("SEGMENT ID found %08X" % segment.get_id()) - segtab = self.process_one_level(segment) - l = segtab[MATROSKA_SEGMENT_INFO_ID] - seginfotab = self.process_one_level(l) - try: - # Express scalecode in ms instead of ns - # Rescale it to the second - scalecode = seginfotab[MATROSKA_TIMECODESCALE_ID].get_value() - except (ZeroDivisionError, KeyError, IndexError): - scalecode = 1000000.0 - - try: - duration = seginfotab[MATROSKA_DURATION_ID].get_float_value() - self.length = duration * scalecode / 1000000000.0 - except (ZeroDivisionError, KeyError, IndexError): - pass - - if MATROSKA_TITLE_ID in seginfotab: - self.title = seginfotab[MATROSKA_TITLE_ID].get_utf8() - - if MATROSKA_DATE_UTC_ID in seginfotab: - self.date = unpack('!q', seginfotab[MATROSKA_DATE_UTC_ID].get_data())[0] / 10.0**9 - # Date is offset 2001-01-01 00:00:00 (timestamp 978307200.0) - self.date += 978307200.0 - - try: - log.debug("Searching for id : %X" % MATROSKA_TRACKS_ID) - entity = segtab[MATROSKA_TRACKS_ID] - self.process_tracks(entity) - except (ZeroDivisionError, KeyError, IndexError): - log.debug("TRACKS ID not found !!" ) - - if MATROSKA_CHAPTERS_ID in segtab: - self.process_chapters(segtab[MATROSKA_CHAPTERS_ID]) - if MATROSKA_ATTACHMENTS_ID in segtab: - self.process_attachments(segtab[MATROSKA_ATTACHMENTS_ID]) - - else: + # Now get the segment + self.segment = segment = EbmlEntity(buffer[header.get_total_len():]) + # Record file offset of segment data for seekheads + self.segment.offset = header.get_total_len() + segment.get_header_len() + if segment.get_id() != MATROSKA_SEGMENT_ID: log.debug("SEGMENT ID not found %08X" % segment.get_id()) + return + + log.debug("SEGMENT ID found %08X" % segment.get_id()) + for elem in self.process_one_level(segment): + if elem.get_id() == MATROSKA_SEEKHEAD_ID: + self.process_elem(elem) + + + def process_elem(self, elem): + elem_id = elem.get_id() + log.debug('BEGIN: process element %s' % hex(elem_id)) + if elem_id == MATROSKA_SEGMENT_INFO_ID: + duration = 0 + scalecode = 1000000.0 + + for ielem in self.process_one_level(elem): + ielem_id = ielem.get_id() + if ielem_id == MATROSKA_TIMECODESCALE_ID: + scalecode = ielem.get_value() + elif ielem_id == MATROSKA_DURATION_ID: + duration = ielem.get_float_value() + elif ielem_id == MATROSKA_TITLE_ID: + self.title = ielem.get_utf8() + elif ielem_id == MATROSKA_DATE_UTC_ID: + self.date = unpack('!q', ielem.get_data())[0] / 10.0**9 + # Date is offset 2001-01-01 00:00:00 (timestamp 978307200.0) + self.date += 978307200.0 + + self.length = duration * scalecode / 1000000000.0 + + elif elem_id == MATROSKA_TRACKS_ID: + self.process_tracks(elem) + + elif elem_id == MATROSKA_CHAPTERS_ID: + self.process_chapters(elem) + + elif elem_id == MATROSKA_ATTACHMENTS_ID: + self.process_attachments(elem) + + elif elem_id == MATROSKA_SEEKHEAD_ID: + self.process_seekhead(elem) + + log.debug('END: process element %s' % hex(elem_id)) + return True + + + def process_seekhead(self, elem): + for seek_elem in self.process_one_level(elem): + if seek_elem.get_id() != MATROSKA_SEEK_ID: + continue + for sub_elem in self.process_one_level(seek_elem): + if sub_elem.get_id() == MATROSKA_SEEKID_ID: + if sub_elem.get_value() == MATROSKA_CLUSTER_ID: + # Not interested in these. + return + + elif sub_elem.get_id() == MATROSKA_SEEK_POSITION_ID: + self.file.seek(self.segment.offset + sub_elem.get_value()) + buffer = self.file.read(100) + elem = EbmlEntity(buffer) + # Fetch all data necessary for this element. + elem.add_data(self.file.read(elem.ebml_length)) + self.process_elem(elem) def process_tracks(self, tracks): tracksbuf = tracks.get_data() - indice = 0 - while indice < tracks.get_len(): - trackelem = EbmlEntity(tracksbuf[indice:]) + index = 0 + while index < tracks.get_len(): + trackelem = EbmlEntity(tracksbuf[index:]) log.debug ("ELEMENT %X found" % trackelem.get_id()) - self.process_one_track(trackelem) - indice += trackelem.get_total_len() + trackelem.get_crc_len() + self.process_track(trackelem) + index += trackelem.get_total_len() + trackelem.get_crc_len() def process_one_level(self, item): buf = item.get_data() - indice = 0 - tabelem = {} - while indice < item.get_len(): - if len(buf[indice:]) == 0: + index = 0 + while index < item.get_len(): + if len(buf[index:]) == 0: break - elem = EbmlEntity(buf[indice:]) - tabelem[elem.get_id()] = elem - indice += elem.get_total_len() + elem.get_crc_len() - return tabelem - - - def process_one_track(self, track): - # Process all the items at the track level - tabelem = self.process_one_level(track) - # We have the dict of track eleme, now build the information - type = tabelem[MATROSKA_TRACK_TYPE_ID] - mytype = type.get_value() - log.debug ("Track type found with UID %d" % mytype) - track = None + elem = EbmlEntity(buf[index:]) + yield elem + index += elem.get_total_len() + elem.get_crc_len() - if mytype == MATROSKA_VIDEO_TRACK: - log.debug("VIDEO TRACK found !!") - track = mediainfo.VideoInfo() - try: - elem = tabelem[MATROSKA_CODEC_ID] - track.codec = elem.get_str() - except (ZeroDivisionError, KeyError, IndexError): - track.codec = u'Unknown' - - # convert codec information - # http://haali.cs.msu.ru/mkv/codecs.pdf - if track.codec in FOURCCMap: - track.codec = FOURCCMap[track.codec] - elif track.codec.endswith('FOURCC') and \ - MATROSKA_CODEC_PRIVATE_ID in tabelem and \ - tabelem[MATROSKA_CODEC_PRIVATE_ID].get_len() == 40: - track.codec = tabelem[MATROSKA_CODEC_PRIVATE_ID].get_data()[16:20] - elif track.codec.startswith('V_REAL/'): - track.codec = track.codec[7:] - elif track.codec.startswith('V_'): - # FIXME: add more video codecs here - track.codec = track.codec[2:] - - try: - elem = tabelem[MATROSKA_FRAME_DURATION_ID] - track.fps = 1 / (pow(10, -9) * (elem.get_value())) - except (ZeroDivisionError, KeyError, IndexError): - track.fps = 0 - - try: - vinfo = tabelem[MATROSKA_VIDEO_SETTINGS_ID] - vidtab = self.process_one_level(vinfo) - track.width = vidtab[MATROSKA_VID_WIDTH_ID].get_value() - track.height = vidtab[MATROSKA_VID_HEIGHT_ID].get_value() - if vidtab.has_key(MATROSKA_DISPLAY_VID_WIDTH_ID) and \ - vidtab.has_key(MATROSKA_DISPLAY_VID_HEIGHT_ID): - track.aspect = float(vidtab[MATROSKA_DISPLAY_VID_WIDTH_ID].get_value()) / \ - vidtab[MATROSKA_DISPLAY_VID_HEIGHT_ID].get_value() - if MATROSKA_VID_INTERLACED in vidtab: - value = int(vidtab[MATROSKA_VID_INTERLACED].get_value()) - self._set('interlaced', value) - - except Exception, e: - log.debug("No other info about video track !!!") - self.media = 'video' - self.video.append(track) - - elif mytype == MATROSKA_AUDIO_TRACK: - log.debug("AUDIO TRACK found !!") - track = mediainfo.AudioInfo() - try: - elem = tabelem[MATROSKA_CODEC_ID] - track.codec = elem.get_str() - if track.codec in FOURCCMap: - track.codec = FOURCCMap[track.codec] - elif track.codec.startswith('A_'): - track.codec = track.codec[2:] - except (KeyError, IndexError): - track.codec = u"Unknown" - - try: - ainfo = tabelem[MATROSKA_AUDIO_SETTINGS_ID] - audtab = self.process_one_level(ainfo) - track.samplerate = audtab[MATROSKA_AUDIO_SAMPLERATE_ID].get_float_value() - track.channels = audtab[MATROSKA_AUDIO_CHANNELS_ID].get_value() - except (KeyError, IndexError): - log.debug("No other info about audio track !!!") + def process_track(self, track): + # Collapse generator into a list since we need to iterate over it + # twice. + elements = [ x for x in self.process_one_level(track) ] + track_type = [ x.get_value() for x in elements if x.get_id() == MATROSKA_TRACK_TYPE_ID ] + if not track_type: + log.debug('Bad track: no type id found') + return - self.audio.append(track) + track_type = track_type[0] + track = None - elif mytype == MATROSKA_SUBTITLES_TRACK: + if track_type == MATROSKA_VIDEO_TRACK: + log.debug("Video track found") + track = self.process_video_track(elements) + elif track_type == MATROSKA_AUDIO_TRACK: + log.debug("Audio track found") + track = self.process_audio_track(elements) + elif track_type == MATROSKA_SUBTITLES_TRACK: + log.debug("Subtitle track found") track = mediainfo.SubtitleInfo() self.subtitles.append(track) + for elem in elements: + self.process_track_common(elem, track) - if not track: - return - if MATROSKA_TRACK_LANGUAGE_ID in tabelem: - track.language = tabelem[MATROSKA_TRACK_LANGUAGE_ID].get_str() + def process_track_common(self, elem, track): + track.language = u'und' + elem_id = elem.get_id() + if elem_id == MATROSKA_TRACK_LANGUAGE_ID: + track.language = elem.get_str() log.debug("Track language found: %s" % track.language) - else: - track.language = u"und" + elif elem_id == MATROSKA_NAME_ID: + track.title = elem.get_utf8() + elif elem_id == MATROSKA_TRACK_NUMBER_ID: + track.trackno = elem.get_value() + + + def process_video_track(self, elements): + track = mediainfo.VideoInfo() + codec_private_id = '' + # Defaults + track.codec = u'Unknown' + track.fps = 0 + + for elem in elements: + elem_id = elem.get_id() + if elem_id == MATROSKA_CODEC_ID: + track.codec = elem.get_str() - if MATROSKA_NAME_ID in tabelem: - track.title = tabelem[MATROSKA_NAME_ID].get_utf8() + elif elem_id == MATROSKA_FRAME_DURATION_ID: + try: + track.fps = 1 / (pow(10, -9) * (elem.get_value())) + except ZeroDivisionError: + pass + + elif elem_id == MATROSKA_VIDEO_SETTINGS_ID: + d_width = d_height = None + for settings_elem in self.process_one_level(elem): + settings_elem_id = settings_elem.get_id() + if settings_elem_id == MATROSKA_VID_WIDTH_ID: + track.width = settings_elem.get_value() + elif settings_elem_id == MATROSKA_VID_HEIGHT_ID: + track.height = settings_elem.get_value() + elif settings_elem_id == MATROSKA_DISPLAY_VID_WIDTH_ID: + d_width = settings_elem.get_value() + elif settings_elem_id == MATROSKA_DISPLAY_VID_HEIGHT_ID: + d_height = settings_elem.get_value() + elif settings_elem_id == MATROSKA_VID_INTERLACED: + value = int(settings_elem.get_value()) + self._set('interlaced', value) + + if None not in (d_width, d_height): + track.aspect = float(d_width) / d_height + + elif elem_id == MATROSKA_CODEC_PRIVATE_ID: + codec_private_id = elem.get_data() + + else: + self.process_track_common(elem, track) + + # convert codec information + # http://haali.cs.msu.ru/mkv/codecs.pdf + if track.codec in FOURCCMap: + track.codec = FOURCCMap[track.codec] + elif track.codec.endswith('FOURCC') and len(codec_private_id) == 40: + track.codec = codec_private_id[16:20] + elif track.codec.startswith('V_REAL/'): + track.codec = track.codec[7:] + elif track.codec.startswith('V_'): + # FIXME: add more video codecs here + track.codec = track.codec[2:] + + self.media = 'video' + self.video.append(track) + return track + + + def process_audio_track(self, elements): + track = mediainfo.AudioInfo() + track.codec = u'Unknown' + + for elem in elements: + elem_id = elem.get_id() + if elem_id == MATROSKA_CODEC_ID: + track.codec = elem.get_str() + elif elem_id == MATROSKA_AUDIO_SETTINGS_ID: + for settings_elem in self.process_one_level(elem): + settings_elem_id = settings_elem.get_id() + if settings_elem_id == MATROSKA_AUDIO_SAMPLERATE_ID: + track.samplerate = settings_elem.get_float_value() + elif settings_elem_id == MATROSKA_AUDIO_CHANNELS_ID: + track.channels = settings_elem.get_value() + else: + self.process_track_common(elem, track) + + + if track.codec in FOURCCMap: + track.codec = FOURCCMap[track.codec] + elif track.codec.startswith('A_'): + track.codec = track.codec[2:] - if MATROSKA_TRACK_NUMBER_ID in tabelem: - track.trackno = tabelem[MATROSKA_TRACK_NUMBER_ID].get_value() + self.audio.append(track) + return track def process_chapters(self, chapters): - tabelem = self.process_one_level(chapters) - if MATROSKA_EDITION_ENTRY_ID not in tabelem: - return - - entry = tabelem[MATROSKA_EDITION_ENTRY_ID] - buf = entry.get_data() - indice = 0 - while indice < entry.get_len(): - elem = EbmlEntity(buf[indice:]) - if elem.get_id() == MATROSKA_CHAPTER_ATOM_ID: - self.process_chapter_atom(elem) - indice += elem.get_total_len() + elem.get_crc_len() + elements = self.process_one_level(chapters) + for elem in elements: + if elem.get_id() == MATROSKA_EDITION_ENTRY_ID: + buf = elem.get_data() + index = 0 + while index < elem.get_len(): + sub_elem = EbmlEntity(buf[index:]) + if sub_elem.get_id() == MATROSKA_CHAPTER_ATOM_ID: + self.process_chapter_atom(sub_elem) + index += sub_elem.get_total_len() + sub_elem.get_crc_len() def process_chapter_atom(self, atom): - tabelem = self.process_one_level(atom) + elements = self.process_one_level(atom) chap = mediainfo.ChapterInfo() - if MATROSKA_CHAPTER_TIME_START_ID in tabelem: - # Scale timecode to seconds (float) - chap.pos = tabelem[MATROSKA_CHAPTER_TIME_START_ID].get_value() / 1000000 / 1000.0 - - if MATROSKA_CHAPTER_FLAG_ENABLED_ID in tabelem: - chap.enabled = tabelem[MATROSKA_CHAPTER_FLAG_ENABLED_ID].get_value() - - if MATROSKA_CHAPTER_DISPLAY_ID in tabelem: - # Matroska supports multiple (chapter name, language) pairs for - # each chapter, so chapter names can be internationalized. This - # logic will only take the last one in the list. - tabelem = self.process_one_level(tabelem[MATROSKA_CHAPTER_DISPLAY_ID]) - if MATROSKA_CHAPTER_STRING_ID in tabelem: - chap.name = tabelem[MATROSKA_CHAPTER_STRING_ID].get_utf8() + for elem in elements: + elem_id = elem.get_id() + if elem_id == MATROSKA_CHAPTER_TIME_START_ID: + # Scale timecode to seconds (float) + chap.pos = elem.get_value() / 1000000 / 1000.0 + elif elem_id == MATROSKA_CHAPTER_FLAG_ENABLED_ID: + chap.enabled = elem.get_value() + elif elem_id == MATROSKA_CHAPTER_DISPLAY_ID: + # Matroska supports multiple (chapter name, language) pairs for + # each chapter, so chapter names can be internationalized. This + # logic will only take the last one in the list. + for display_elem in self.process_one_level(elem): + if display_elem.get_id() == MATROSKA_CHAPTER_STRING_ID: + chap.name = display_elem.get_utf8() log.debug('Chapter "%s" found' % str(chap.name)) self.chapters.append(chap) @@ -478,28 +542,29 @@ def process_attachments(self, attachments): buf = attachments.get_data() - indice = 0 - while indice < attachments.get_len(): - elem = EbmlEntity(buf[indice:]) + index = 0 + while index < attachments.get_len(): + elem = EbmlEntity(buf[index:]) if elem.get_id() == MATROSKA_ATTACHED_FILE_ID: self.process_attachment(elem) - indice += elem.get_total_len() + elem.get_crc_len() + index += elem.get_total_len() + elem.get_crc_len() def process_attachment(self, attachment): - tabelem = self.process_one_level(attachment) + elements = self.process_one_level(attachment) name = desc = mimetype = "" + data = None - if MATROSKA_FILE_NAME_ID in tabelem: - name = tabelem[MATROSKA_FILE_NAME_ID].get_utf8() - if MATROSKA_FILE_DESC_ID in tabelem: - desc = tabelem[MATROSKA_FILE_DESC_ID].get_utf8() - if MATROSKA_FILE_MIME_TYPE_ID in tabelem: - mimetype = tabelem[MATROSKA_FILE_MIME_TYPE_ID].get_data() - if MATROSKA_FILE_DATA_ID in tabelem: - data = tabelem[MATROSKA_FILE_DATA_ID].get_data() - else: - data = None + for elem in elements: + elem_id = elem.get_id() + if elem_id == MATROSKA_FILE_NAME_ID: + name = elem.get_utf8() + elif elem_id == MATROSKA_FILE_DESC_ID: + desc = elem.get_utf8() + elif elem_id == MATROSKA_FILE_MIME_TYPE_ID: + mimetype = elem.get_data() + elif elem_id == MATROSKA_FILE_DATA_ID: + data = elem.get_data() # Right now we only support attachments that could be cover images. # Make a guess to see if this attachment is a cover image. ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Freevo-cvslog mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/freevo-cvslog
