Files
intercept/utils/sstv/image_decoder.py
Smittix 2e9bab75b1 fix: Correct Scottie sync search to prevent decoder stall
The previous sync search used search_margin = line_samples/10 (~306
samples for Scottie2), reaching deep into B channel pixel data behind
pos and well past the expected sync end ahead of pos.

When _find_sync returned a position in the late portion of that wide
region, pos + R_channel_samples exceeded the buffer length. The
buffer-too-short guard in _decode_line then returned early without
consuming data or advancing the line counter, causing the stall guard
in feed() to permanently break the decode loop.

Fix: use a 50-sample backward margin (covers >130 ppm SDR drift) and
a forward margin capped to whatever the current buffer can safely
support for the R channel. A final candidate-position check before
committing pos ensures no overflow is possible.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-19 10:01:28 +00:00

497 lines
19 KiB
Python

"""SSTV scanline-by-scanline image decoder.
Decodes raw audio samples into a PIL Image for all supported SSTV modes.
Handles sync pulse re-synchronization on each line for robust decoding
under weak-signal or drifting conditions.
"""
from __future__ import annotations
from typing import Callable
import numpy as np
from .constants import (
FREQ_BLACK,
FREQ_PIXEL_HIGH,
FREQ_PIXEL_LOW,
FREQ_SYNC,
SAMPLE_RATE,
)
from .dsp import (
goertzel,
samples_for_duration,
)
from .modes import (
ColorModel,
SSTVMode,
SyncPosition,
)
# Pillow is imported lazily to keep the module importable when Pillow
# is not installed (is_sstv_available() just returns True, but actual
# decoding would fail gracefully).
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment,misc]
# Type alias for progress callback: (current_line, total_lines)
ProgressCallback = Callable[[int, int], None]
class SSTVImageDecoder:
"""Decode an SSTV image from a stream of audio samples.
Usage::
decoder = SSTVImageDecoder(mode)
decoder.feed(samples)
...
if decoder.is_complete:
image = decoder.get_image()
"""
def __init__(self, mode: SSTVMode, sample_rate: int = SAMPLE_RATE,
progress_cb: ProgressCallback | None = None):
self._mode = mode
self._sample_rate = sample_rate
self._progress_cb = progress_cb
self._buffer = np.array([], dtype=np.float64)
self._current_line = 0
self._complete = False
# Pre-calculate sample counts
self._sync_samples = samples_for_duration(
mode.sync_duration_ms / 1000.0, sample_rate)
self._porch_samples = samples_for_duration(
mode.sync_porch_ms / 1000.0, sample_rate)
self._line_samples = samples_for_duration(
mode.line_duration_ms / 1000.0, sample_rate)
self._separator_samples = (
samples_for_duration(mode.channel_separator_ms / 1000.0, sample_rate)
if mode.channel_separator_ms > 0 else 0
)
self._channel_samples = [
samples_for_duration(ch.duration_ms / 1000.0, sample_rate)
for ch in mode.channels
]
# For PD modes, each "line" of audio produces 2 image lines
if mode.color_model == ColorModel.YCRCB_DUAL:
self._total_audio_lines = mode.height // 2
else:
self._total_audio_lines = mode.height
# Initialize pixel data arrays per channel
self._channel_data: list[np.ndarray] = []
for _i, _ch_spec in enumerate(mode.channels):
if mode.color_model == ColorModel.YCRCB_DUAL:
# Y1, Cr, Cb, Y2 - all are width-wide
self._channel_data.append(
np.zeros((self._total_audio_lines, mode.width), dtype=np.uint8))
else:
self._channel_data.append(
np.zeros((mode.height, mode.width), dtype=np.uint8))
# Track sync position for re-synchronization
self._expected_line_start = 0 # Sample offset within buffer
self._synced = False
@property
def is_complete(self) -> bool:
return self._complete
@property
def current_line(self) -> int:
return self._current_line
@property
def total_lines(self) -> int:
return self._total_audio_lines
@property
def progress_percent(self) -> int:
if self._total_audio_lines == 0:
return 0
return min(100, int(100 * self._current_line / self._total_audio_lines))
def feed(self, samples: np.ndarray) -> bool:
"""Feed audio samples into the decoder.
Args:
samples: Float64 audio samples.
Returns:
True when image is complete.
"""
if self._complete:
return True
self._buffer = np.concatenate([self._buffer, samples])
# Process complete lines.
# Guard against stalls: if _decode_line() cannot consume data
# (e.g. sub-component samples exceed line_samples due to rounding),
# break out and wait for more audio.
while not self._complete and len(self._buffer) >= self._line_samples:
prev_line = self._current_line
prev_len = len(self._buffer)
self._decode_line()
if self._current_line == prev_line and len(self._buffer) == prev_len:
break # No progress — need more data
# Prevent unbounded buffer growth - keep at most 2 lines worth
max_buffer = self._line_samples * 2
if len(self._buffer) > max_buffer and not self._complete:
self._buffer = self._buffer[-max_buffer:]
return self._complete
def _find_sync(self, search_region: np.ndarray) -> int | None:
"""Find the 1200 Hz sync pulse within a search region.
Scans through the region looking for a stretch of 1200 Hz
tone of approximately the right duration.
Args:
search_region: Audio samples to search within.
Returns:
Sample offset of the sync pulse start, or None if not found.
"""
window_size = min(self._sync_samples, 200)
if len(search_region) < window_size:
return None
best_pos = None
best_energy = 0.0
step = window_size // 2
for pos in range(0, len(search_region) - window_size, step):
chunk = search_region[pos:pos + window_size]
sync_energy = goertzel(chunk, FREQ_SYNC, self._sample_rate)
# Check it's actually sync, not data at 1200 Hz area
black_energy = goertzel(chunk, FREQ_BLACK, self._sample_rate)
if sync_energy > best_energy and sync_energy > black_energy * 2:
best_energy = sync_energy
best_pos = pos
return best_pos
def _decode_line(self) -> None:
"""Decode one scanline from the buffer."""
if self._current_line >= self._total_audio_lines:
self._complete = True
return
# Try to find sync pulse for re-synchronization
# Search within +/-10% of expected line start
search_margin = max(100, self._line_samples // 10)
line_start = 0
if self._mode.sync_position in (SyncPosition.FRONT, SyncPosition.FRONT_PD):
# Sync is at the beginning of each line
search_start = 0
search_end = min(len(self._buffer), self._sync_samples + search_margin)
search_region = self._buffer[search_start:search_end]
sync_pos = self._find_sync(search_region)
if sync_pos is not None:
line_start = sync_pos
# Skip sync + porch to get to pixel data
pixel_start = line_start + self._sync_samples + self._porch_samples
elif self._mode.sync_position == SyncPosition.MIDDLE:
# Scottie: sep(1.5ms) -> G -> sep(1.5ms) -> B -> sync(9ms) -> porch(1.5ms) -> R
# Skip initial separator (same duration as porch)
pixel_start = self._porch_samples
line_start = 0
else:
pixel_start = self._sync_samples + self._porch_samples
# Decode each channel
pos = pixel_start
for ch_idx, ch_samples in enumerate(self._channel_samples):
if pos + ch_samples > len(self._buffer):
# Not enough data yet - put the data back and wait
return
channel_audio = self._buffer[pos:pos + ch_samples]
pixels = self._decode_channel_pixels(channel_audio)
self._channel_data[ch_idx][self._current_line, :] = pixels
pos += ch_samples
# Add inter-channel gaps based on mode family
if ch_idx < len(self._channel_samples) - 1:
if self._mode.sync_position == SyncPosition.MIDDLE:
if ch_idx == 0:
# Scottie: separator between G and B
pos += self._porch_samples
else:
# Scottie: sync + porch between B and R.
# Search for the actual sync pulse to correct per-line
# SDR clock drift — without this, timing errors
# accumulate line-by-line producing a visible slant.
#
# Constraints:
# - Backward margin is small (50 samples ≈ 4.5 ms)
# so we don't stray deep into B pixel data.
# - Forward margin is bounded by available buffer so
# the R channel decode never overflows the buffer.
# - The candidate position is validated before use.
r_samples = self._channel_samples[-1]
bwd = min(50, pos)
fwd = max(0, len(self._buffer) - pos
- self._sync_samples - self._porch_samples
- r_samples)
fwd = min(fwd, self._sync_samples)
if bwd + fwd > 0:
sync_region = self._buffer[
pos - bwd: pos + self._sync_samples + fwd]
sync_found = self._find_sync(sync_region)
if sync_found is not None:
candidate = (pos - bwd + sync_found
+ self._sync_samples
+ self._porch_samples)
if candidate + r_samples <= len(self._buffer):
pos = candidate
else:
pos += self._sync_samples + self._porch_samples
else:
pos += self._sync_samples + self._porch_samples
else:
pos += self._sync_samples + self._porch_samples
elif self._separator_samples > 0:
# Robot: separator + porch between channels
pos += self._separator_samples
elif (self._mode.sync_position == SyncPosition.FRONT
and self._mode.color_model == ColorModel.RGB):
# Martin: porch between channels
pos += self._porch_samples
# Advance buffer past this line
consumed = max(pos, self._line_samples)
self._buffer = self._buffer[consumed:]
self._current_line += 1
if self._progress_cb:
self._progress_cb(self._current_line, self._total_audio_lines)
if self._current_line >= self._total_audio_lines:
self._complete = True
def _decode_channel_pixels(self, audio: np.ndarray) -> np.ndarray:
"""Decode pixel values from a channel's audio data.
Uses the analytic signal (Hilbert transform via FFT) to compute
the instantaneous frequency at every sample, then averages over
each pixel's duration. This is the same FM-demodulation approach
used by QSSTV and other professional SSTV decoders, and provides
far better frequency resolution than windowed Goertzel — especially
for fast modes (Martin2, Scottie2) where each pixel spans only
~11-13 audio samples.
Args:
audio: Audio samples for one channel of one scanline.
Returns:
Array of pixel values (0-255), shape (width,).
"""
width = self._mode.width
n = len(audio)
if n < width:
return np.zeros(width, dtype=np.uint8)
# --- Analytic signal via Hilbert transform (FFT method) ---
spectrum = np.fft.fft(audio)
# Build the analytic-signal multiplier:
# h[0] = 1 (DC), h[1..N/2-1] = 2 (positive freqs),
# h[N/2] = 1 (Nyquist), h[N/2+1..] = 0 (negative freqs)
h = np.zeros(n)
if n % 2 == 0:
h[0] = h[n // 2] = 1
h[1:n // 2] = 2
else:
h[0] = 1
h[1:(n + 1) // 2] = 2
analytic = np.fft.ifft(spectrum * h)
# --- Instantaneous frequency ---
phase = np.unwrap(np.angle(analytic))
inst_freq = np.diff(phase) * (self._sample_rate / (2.0 * np.pi))
# --- Average frequency per pixel ---
freq_len = len(inst_freq)
if freq_len < width:
# Fewer freq samples than pixels — index directly
indices = np.linspace(0, freq_len - 1, width).astype(int)
avg_freqs = inst_freq[indices]
else:
pixel_edges = np.linspace(0, freq_len, width + 1).astype(int)
segment_starts = pixel_edges[:-1]
segment_lengths = np.diff(pixel_edges)
segment_lengths = np.maximum(segment_lengths, 1)
sums = np.add.reduceat(inst_freq, segment_starts)
avg_freqs = sums / segment_lengths
# Map to pixel values (1500 Hz → 0, 2300 Hz → 255)
normalized = (avg_freqs - FREQ_PIXEL_LOW) / (
FREQ_PIXEL_HIGH - FREQ_PIXEL_LOW)
return np.clip(normalized * 255 + 0.5, 0, 255).astype(np.uint8)
def get_image(self) -> Image.Image | None:
"""Convert decoded channel data to a PIL Image.
Returns:
PIL Image in RGB mode, or None if Pillow is not available
or decoding is incomplete.
"""
if Image is None:
return None
mode = self._mode
if mode.color_model == ColorModel.RGB:
return self._assemble_rgb()
elif mode.color_model == ColorModel.YCRCB:
return self._assemble_ycrcb()
elif mode.color_model == ColorModel.YCRCB_DUAL:
return self._assemble_ycrcb_dual()
return None
def _assemble_rgb(self) -> Image.Image:
"""Assemble RGB image from sequential R, G, B channel data.
Martin/Scottie channel order: G, B, R.
"""
height = self._mode.height
# Channel order for Martin/Scottie: [0]=G, [1]=B, [2]=R
g_data = self._channel_data[0][:height]
b_data = self._channel_data[1][:height]
r_data = self._channel_data[2][:height]
rgb = np.stack([r_data, g_data, b_data], axis=-1)
return Image.fromarray(rgb, 'RGB')
def _assemble_ycrcb(self) -> Image.Image:
"""Assemble image from YCrCb data (Robot modes).
Robot36: Y every line, Cr/Cb alternating (half-rate chroma).
Robot72: Y, Cr, Cb every line (full-rate chroma).
"""
height = self._mode.height
width = self._mode.width
if not self._mode.has_half_rate_chroma:
# Full-rate chroma (Robot72): Y, Cr, Cb as separate channels
y_data = self._channel_data[0][:height].astype(np.float64)
cr = self._channel_data[1][:height].astype(np.float64)
cb = self._channel_data[2][:height].astype(np.float64)
return self._ycrcb_to_rgb(y_data, cr, cb, height, width)
# Half-rate chroma (Robot36): Y + alternating Cr/Cb
y_data = self._channel_data[0][:height].astype(np.float64)
chroma_data = self._channel_data[1][:height].astype(np.float64)
# Separate Cr (even lines) and Cb (odd lines), then interpolate
cr = np.zeros((height, width), dtype=np.float64)
cb = np.zeros((height, width), dtype=np.float64)
for line in range(height):
if line % 2 == 0:
cr[line] = chroma_data[line]
else:
cb[line] = chroma_data[line]
# Interpolate missing chroma lines
for line in range(height):
if line % 2 == 1:
# Missing Cr - interpolate from neighbors
prev_cr = line - 1 if line > 0 else line + 1
next_cr = line + 1 if line + 1 < height else line - 1
cr[line] = (cr[prev_cr] + cr[next_cr]) / 2
else:
# Missing Cb - interpolate from neighbors
prev_cb = line - 1 if line > 0 else line + 1
next_cb = line + 1 if line + 1 < height else line - 1
if prev_cb >= 0 and next_cb < height:
cb[line] = (cb[prev_cb] + cb[next_cb]) / 2
elif prev_cb >= 0:
cb[line] = cb[prev_cb]
else:
cb[line] = cb[next_cb]
return self._ycrcb_to_rgb(y_data, cr, cb, height, width)
def _assemble_ycrcb_dual(self) -> Image.Image:
"""Assemble image from dual-luminance YCrCb data (PD modes).
PD modes send Y1, Cr, Cb, Y2 per audio line, producing 2 image lines.
"""
audio_lines = self._total_audio_lines
width = self._mode.width
height = self._mode.height
y1_data = self._channel_data[0][:audio_lines].astype(np.float64)
cr_data = self._channel_data[1][:audio_lines].astype(np.float64)
cb_data = self._channel_data[2][:audio_lines].astype(np.float64)
y2_data = self._channel_data[3][:audio_lines].astype(np.float64)
# Interleave Y1 and Y2 to produce full-height luminance
y_full = np.zeros((height, width), dtype=np.float64)
cr_full = np.zeros((height, width), dtype=np.float64)
cb_full = np.zeros((height, width), dtype=np.float64)
for i in range(audio_lines):
even_line = i * 2
odd_line = i * 2 + 1
if even_line < height:
y_full[even_line] = y1_data[i]
cr_full[even_line] = cr_data[i]
cb_full[even_line] = cb_data[i]
if odd_line < height:
y_full[odd_line] = y2_data[i]
cr_full[odd_line] = cr_data[i]
cb_full[odd_line] = cb_data[i]
return self._ycrcb_to_rgb(y_full, cr_full, cb_full, height, width)
@staticmethod
def _ycrcb_to_rgb(y: np.ndarray, cr: np.ndarray, cb: np.ndarray,
height: int, width: int) -> Image.Image:
"""Convert YCrCb pixel data to an RGB PIL Image.
Uses the SSTV convention where pixel values 0-255 map to the
standard Y'CbCr color space used by JPEG/SSTV.
"""
# Normalize from 0-255 pixel range to standard ranges
# Y: 0-255, Cr/Cb: 0-255 centered at 128
y_norm = y
cr_norm = cr - 128.0
cb_norm = cb - 128.0
# ITU-R BT.601 conversion
r = y_norm + 1.402 * cr_norm
g = y_norm - 0.344136 * cb_norm - 0.714136 * cr_norm
b = y_norm + 1.772 * cb_norm
# Clip and convert
r = np.clip(r, 0, 255).astype(np.uint8)
g = np.clip(g, 0, 255).astype(np.uint8)
b = np.clip(b, 0, 255).astype(np.uint8)
rgb = np.stack([r, g, b], axis=-1)
return Image.fromarray(rgb, 'RGB')