[FFmpeg-devel] [PATCH] avformat/dss: fix DSS SP demux of voice-activated (paused) recordings

Guillain d'Erceville via ffmpeg-devel Thu, 25 Jun 2026 06:57:27 -0700

dss_sp_read_packet() reads the audio area as a flat concatenation of
506-byte block payloads. That is correct only for continuous, gap-free
recordings. Real Olympus dictation is voice-activated: when the speaker
pauses, the recorder emits a block with frame_count == 0 (header byte 2)
whose payload, after the few bytes that finish the frame straddling the
block boundary, is padding to be discarded; the next block restarts the
frame grid at its anchor 2*byte1 (+2 when byte-swapped). The demuxer
read straight through both, so the stream desynced from the first pause
to the end of the file: audio before the pause decodes correctly, while
everything after it is audibly garbled.


Skip the padding of empty blocks and re-sync the following block at its
anchor, restarting the byte-swap parity from that block -- the same
re-anchoring dss_read_seek() already performs. The change is a no-op on
gap-free audio: no block is empty there, so the new path is never taken
(verified bit-identical to the previous output up to the first pause on
a real recording, and the existing DSS SP FATE coverage is unchanged).

The re-anchoring rule was confirmed against the Olympus DssParser.dll
reference: the emitted frame positions match the live parser, and the
paused sample now decodes correctly across the boundary where it was
previously garbled.

Signed-off-by: Guillain d'Erceville <[email protected]>
---
 libavformat/dss.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/libavformat/dss.c b/libavformat/dss.c
index ce86b32..998dcf9 100644
--- a/libavformat/dss.c
+++ b/libavformat/dss.c
@@ -53,6 +53,7 @@ typedef struct DSSDemuxContext {
     int counter;
     int swap;
     int dss_sp_swap_byte;
+    int resync_pending;
 
     int packet_size;
     int dss_header_size;
@@ -171,6 +172,7 @@ static int dss_read_header(AVFormatContext *s)
 
     ctx->counter = 0;
     ctx->swap    = 0;
+    ctx->resync_pending = 0;
 
     return 0;
 }
@@ -179,8 +181,57 @@ static void dss_skip_audio_header(AVFormatContext *s, AVPacket *pkt)
 {
     DSSDemuxContext *ctx = s->priv_data;
     AVIOContext *pb = s->pb;
+    uint8_t header[DSS_AUDIO_BLOCK_HEADER_SIZE];
+    int offset, read_size;
+
+    /* Second half of the VOX-pause handling (see below): the frame that
+     * straddled into the empty block has now been completed from its
+     * leading bytes. Discard the rest of that block (padding) by
+     * aligning to the next 512-byte block boundary, then re-sync the
+     * frame grid at the following block's anchor 2*byte1 (+2 when
+     * byte-swapped), restarting the byte-swap parity from that block. */
+    if (ctx->audio_codec == DSS_ACODEC_DSS_SP && ctx->resync_pending) {
+        int64_t rel = avio_tell(pb) - ctx->dss_header_size;
+        int pad = (DSS_BLOCK_SIZE - (int)(rel % DSS_BLOCK_SIZE)) % DSS_BLOCK_SIZE;
+
+        ctx->resync_pending = 0;
+        avio_skip(pb, pad);
+        if (avio_read(pb, header, DSS_AUDIO_BLOCK_HEADER_SIZE) <
+            DSS_AUDIO_BLOCK_HEADER_SIZE) {
+            ctx->counter = 0;
+            return;
+        }
+        ctx->swap = !!(header[0] & 0x80);
+        offset    = 2 * header[1] + 2 * ctx->swap;
+        if (offset < DSS_AUDIO_BLOCK_HEADER_SIZE)
+            offset = DSS_AUDIO_BLOCK_HEADER_SIZE;
+        if (offset > DSS_BLOCK_SIZE)
+            offset = DSS_BLOCK_SIZE;
+        avio_skip(pb, offset - DSS_AUDIO_BLOCK_HEADER_SIZE);
+        ctx->counter          = DSS_BLOCK_SIZE - offset;
+        ctx->dss_sp_swap_byte = -1;
+        return;
+    }
+
+    if (avio_read(pb, header, DSS_AUDIO_BLOCK_HEADER_SIZE) <
+        DSS_AUDIO_BLOCK_HEADER_SIZE) {
+        ctx->counter = 0;
+        return;
+    }
+
+    /* Real Olympus dictation is voice-activated: a pause emits an empty
+     * block (frame_count == 0) that carries no fresh frames. Its leading
+     * bytes complete the frame that straddles into it (read normally by
+     * the caller); make the next call consume nothing more from this
+     * block, so the trailing padding is skipped and the grid re-syncs.
+     * On gap-free audio no block is empty and this never triggers. */
+    if (ctx->audio_codec == DSS_ACODEC_DSS_SP && header[2] == 0) {
+        read_size           = ctx->swap ? DSS_FRAME_SIZE - 2 : DSS_FRAME_SIZE;
+        ctx->counter        = read_size;
+        ctx->resync_pending = 1;
+        return;
+    }
 
-    avio_skip(pb, DSS_AUDIO_BLOCK_HEADER_SIZE);
     ctx->counter += DSS_BLOCK_SIZE - DSS_AUDIO_BLOCK_HEADER_SIZE;
 }
 
@@ -344,6 +395,7 @@ static int dss_read_seek(AVFormatContext *s, int stream_index,
     if (ret < 0)
         return ret;
     ctx->swap = !!(header[0] & 0x80);
+    ctx->resync_pending = 0;
     offset = 2*header[1] + 2*ctx->swap;
     if (offset < DSS_AUDIO_BLOCK_HEADER_SIZE)
         return AVERROR_INVALIDDATA;
-- 
2.43.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] avformat/dss: fix DSS SP demux of voice-activated (paused) recordings

Reply via email to