# Mirror of https://github.com/smittix/intercept.git (synced 2026-04-24 22:59:59 -07:00).
"""SSTV scanline-by-scanline image decoder.
|
||
|
||
Decodes raw audio samples into a PIL Image for all supported SSTV modes.
|
||
Handles sync pulse re-synchronization on each line for robust decoding
|
||
under weak-signal or drifting conditions.
|
||
"""
|
||
|
||
from __future__ import annotations

from typing import Callable

import numpy as np

from .constants import (
    FREQ_BLACK,
    FREQ_PIXEL_HIGH,
    FREQ_PIXEL_LOW,
    FREQ_SYNC,
    SAMPLE_RATE,
)
from .dsp import (
    goertzel,
    goertzel_batch,
    samples_for_duration,
)
from .modes import (
    ColorModel,
    SSTVMode,
    SyncPosition,
)

# Pillow is an optional dependency. The import runs at module load but is
# guarded, which keeps this module importable when Pillow is not installed
# (is_sstv_available() just returns True, but actual decoding would fail
# gracefully: get_image() checks `Image is None` and returns None).
try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment,misc]


# Type alias for progress callback: (current_line, total_lines)
ProgressCallback = Callable[[int, int], None]
class SSTVImageDecoder:
    """Decode an SSTV image from a stream of audio samples.

    Samples are pushed in incrementally via :meth:`feed`; complete
    scanlines are decoded as soon as enough audio has accumulated.

    Usage::

        decoder = SSTVImageDecoder(mode)
        decoder.feed(samples)
        ...
        if decoder.is_complete:
            image = decoder.get_image()
    """

    def __init__(self, mode: SSTVMode, sample_rate: int = SAMPLE_RATE,
                 progress_cb: ProgressCallback | None = None) -> None:
        self._mode = mode
        self._sample_rate = sample_rate
        self._progress_cb = progress_cb

        # Rolling audio buffer; feed() appends, _decode_line() consumes.
        self._buffer = np.array([], dtype=np.float64)
        self._current_line = 0
        self._complete = False

        # Pre-calculate sample counts
        self._sync_samples = samples_for_duration(
            mode.sync_duration_ms / 1000.0, sample_rate)
        self._porch_samples = samples_for_duration(
            mode.sync_porch_ms / 1000.0, sample_rate)
        self._line_samples = samples_for_duration(
            mode.line_duration_ms / 1000.0, sample_rate)
        self._separator_samples = (
            samples_for_duration(mode.channel_separator_ms / 1000.0, sample_rate)
            if mode.channel_separator_ms > 0 else 0
        )

        # Samples per color channel, in transmission order.
        self._channel_samples = [
            samples_for_duration(ch.duration_ms / 1000.0, sample_rate)
            for ch in mode.channels
        ]

        # For PD modes, each "line" of audio produces 2 image lines
        if mode.color_model == ColorModel.YCRCB_DUAL:
            self._total_audio_lines = mode.height // 2
        else:
            self._total_audio_lines = mode.height

        # Initialize pixel data arrays per channel
        self._channel_data: list[np.ndarray] = []
        for _i, _ch_spec in enumerate(mode.channels):
            if mode.color_model == ColorModel.YCRCB_DUAL:
                # Y1, Cr, Cb, Y2 - all are width-wide
                self._channel_data.append(
                    np.zeros((self._total_audio_lines, mode.width), dtype=np.uint8))
            else:
                self._channel_data.append(
                    np.zeros((mode.height, mode.width), dtype=np.uint8))

        # Track sync position for re-synchronization
        # NOTE(review): neither attribute below is read anywhere in this
        # class — they look like leftovers from an earlier sync-tracking
        # scheme. Verify no external code relies on them before removing.
        self._expected_line_start = 0  # Sample offset within buffer
        self._synced = False

        # Per-line mid-sync deviation measurements (Scottie modes only).
        # Each entry is the measured offset (in samples) of the sync pulse
        # relative to its expected position: negative = early, positive = late.
        # Used by get_image() to apply post-processing slant correction.
        self._sync_deviations: list[float | None] = []

    @property
    def is_complete(self) -> bool:
        """True once every audio line has been decoded."""
        return self._complete

    @property
    def current_line(self) -> int:
        """Number of audio lines decoded so far."""
        return self._current_line

    @property
    def total_lines(self) -> int:
        """Total audio lines expected (half the image height for PD modes)."""
        return self._total_audio_lines

    @property
    def progress_percent(self) -> int:
        """Decoding progress, 0-100 (clamped)."""
        if self._total_audio_lines == 0:
            return 0
        return min(100, int(100 * self._current_line / self._total_audio_lines))

    def feed(self, samples: np.ndarray) -> bool:
        """Feed audio samples into the decoder.

        Args:
            samples: Float64 audio samples.

        Returns:
            True when image is complete.
        """
        if self._complete:
            return True

        self._buffer = np.concatenate([self._buffer, samples])

        # Process complete lines.
        # Guard against stalls: if _decode_line() cannot consume data
        # (e.g. sub-component samples exceed line_samples due to rounding),
        # break out and wait for more audio.
        while not self._complete and len(self._buffer) >= self._line_samples:
            prev_line = self._current_line
            prev_len = len(self._buffer)
            self._decode_line()
            if self._current_line == prev_line and len(self._buffer) == prev_len:
                break  # No progress — need more data

        # Prevent unbounded buffer growth - keep at most 2 lines worth
        max_buffer = self._line_samples * 2
        if len(self._buffer) > max_buffer and not self._complete:
            self._buffer = self._buffer[-max_buffer:]

        return self._complete

    def _find_sync(self, search_region: np.ndarray) -> int | None:
        """Find the 1200 Hz sync pulse within a search region.

        Scans through the region looking for a stretch of 1200 Hz
        tone of approximately the right duration.

        Args:
            search_region: Audio samples to search within.

        Returns:
            Sample offset of the sync pulse start, or None if not found.
            The offset is quantized to half-window granularity because the
            scan advances by `window_size // 2` samples per step.
        """
        # Cap the analysis window at 200 samples so scanning stays cheap
        # even for modes with long sync pulses.
        window_size = min(self._sync_samples, 200)
        if len(search_region) < window_size:
            return None

        best_pos = None
        best_energy = 0.0

        step = window_size // 2
        for pos in range(0, len(search_region) - window_size, step):
            chunk = search_region[pos:pos + window_size]
            sync_energy = goertzel(chunk, FREQ_SYNC, self._sample_rate)
            # Check it's actually sync, not data at 1200 Hz area:
            # require sync energy to dominate black-level energy 2:1.
            black_energy = goertzel(chunk, FREQ_BLACK, self._sample_rate)
            if sync_energy > best_energy and sync_energy > black_energy * 2:
                best_energy = sync_energy
                best_pos = pos

        return best_pos

    def _decode_line(self) -> None:
        """Decode one scanline from the buffer.

        Consumes at least one line's worth of samples on success; returns
        without consuming anything when more audio is needed (feed()'s
        stall guard detects that case).
        """
        if self._current_line >= self._total_audio_lines:
            self._complete = True
            return

        # Try to find sync pulse for re-synchronization
        # Search within +/-10% of expected line start
        search_margin = max(100, self._line_samples // 10)

        line_start = 0

        if self._mode.sync_position in (SyncPosition.FRONT, SyncPosition.FRONT_PD):
            # Sync is at the beginning of each line
            search_start = 0
            search_end = min(len(self._buffer), self._sync_samples + search_margin)
            search_region = self._buffer[search_start:search_end]

            sync_pos = self._find_sync(search_region)
            if sync_pos is not None:
                line_start = sync_pos
            # Skip sync + porch to get to pixel data
            pixel_start = line_start + self._sync_samples + self._porch_samples

        elif self._mode.sync_position == SyncPosition.MIDDLE:
            # Scottie: sep(1.5ms) -> G -> sep(1.5ms) -> B -> sync(9ms) -> porch(1.5ms) -> R
            # Skip initial separator (same duration as porch)
            pixel_start = self._porch_samples
            # NOTE(review): line_start is not read again in this branch —
            # this assignment is dead code.
            line_start = 0

        else:
            pixel_start = self._sync_samples + self._porch_samples

        # Decode each channel
        pos = pixel_start
        for ch_idx, ch_samples in enumerate(self._channel_samples):
            if pos + ch_samples > len(self._buffer):
                # Not enough data yet - put the data back and wait
                return

            channel_audio = self._buffer[pos:pos + ch_samples]
            pixels = self._decode_channel_pixels(channel_audio)
            self._channel_data[ch_idx][self._current_line, :] = pixels
            pos += ch_samples

            # Add inter-channel gaps based on mode family
            if ch_idx < len(self._channel_samples) - 1:
                if self._mode.sync_position == SyncPosition.MIDDLE:
                    if ch_idx == 0:
                        # Scottie: separator between G and B
                        pos += self._porch_samples
                    else:
                        # Scottie: sync + porch between B and R.
                        # Measure the actual sync position for post-processing
                        # slant correction without touching pos or consumed —
                        # so a noisy/false measurement never corrupts the decode.
                        r_samples = self._channel_samples[-1]
                        # Large window to cover cumulative SDR clock drift over
                        # the full image. bwd=50 saturates after ~25 lines for
                        # a 2-sample/line drift; 800 samples covers ±200 ppm.
                        bwd = min(800, pos)
                        remaining = len(self._buffer) - pos
                        # Forward margin is limited so the scan never eats
                        # into the R channel that still has to be decoded.
                        fwd = min(800, max(
                            0,
                            remaining - self._sync_samples
                            - self._porch_samples - r_samples))
                        deviation: float | None = None
                        if bwd + fwd > 0:
                            region_start = pos - bwd
                            sync_region = self._buffer[
                                region_start: pos + self._sync_samples + fwd]
                            # Full-sync-length window gives best freq resolution
                            # (~111 Hz at 48 kHz) to cleanly separate 1200 Hz
                            # sync from 1500 Hz pixel data.
                            win = self._sync_samples
                            n_raw = len(sync_region) - win + 1
                            if n_raw > 0:
                                # Step 5 samples (~0.1 ms) — enough resolution
                                # for pixel-level drift, keeps batch size small.
                                step = 5
                                all_windows = (
                                    np.lib.stride_tricks.sliding_window_view(
                                        sync_region, win))
                                windows = all_windows[::step]
                                energies = goertzel_batch(
                                    windows,
                                    np.array([FREQ_SYNC, FREQ_BLACK]),
                                    self._sample_rate)
                                sync_e = energies[:, 0]
                                black_e = energies[:, 1]
                                # Same 2:1 sync-vs-black test as _find_sync().
                                valid_mask = sync_e > black_e * 2
                                if valid_mask.any():
                                    fine_idx = int(
                                        np.argmax(
                                            np.where(valid_mask, sync_e, 0.0)))
                                    # Offset relative to expected sync start:
                                    # negative = early, positive = late.
                                    deviation = float(fine_idx * step - bwd)
                        # Append even when None so list index == line number.
                        self._sync_deviations.append(deviation)
                        pos += self._sync_samples + self._porch_samples
                elif self._separator_samples > 0:
                    # Robot: separator + porch between channels
                    pos += self._separator_samples
                elif (self._mode.sync_position == SyncPosition.FRONT
                      and self._mode.color_model == ColorModel.RGB):
                    # Martin: porch between channels
                    pos += self._porch_samples

        # Advance buffer past this line. Consume at least the nominal line
        # duration even if component rounding left pos short of it.
        consumed = max(pos, self._line_samples)
        self._buffer = self._buffer[consumed:]

        self._current_line += 1

        if self._progress_cb:
            self._progress_cb(self._current_line, self._total_audio_lines)

        if self._current_line >= self._total_audio_lines:
            self._complete = True

    def _decode_channel_pixels(self, audio: np.ndarray) -> np.ndarray:
        """Decode pixel values from a channel's audio data.

        Uses the analytic signal (Hilbert transform via FFT) to compute
        the instantaneous frequency at every sample, then averages over
        each pixel's duration. This is the same FM-demodulation approach
        used by QSSTV and other professional SSTV decoders, and provides
        far better frequency resolution than windowed Goertzel — especially
        for fast modes (Martin2, Scottie2) where each pixel spans only
        ~11-13 audio samples.

        Args:
            audio: Audio samples for one channel of one scanline.

        Returns:
            Array of pixel values (0-255), shape (width,).
        """
        width = self._mode.width
        n = len(audio)

        # Degenerate input (fewer samples than pixels): emit a black row.
        if n < width:
            return np.zeros(width, dtype=np.uint8)

        # --- Analytic signal via Hilbert transform (FFT method) ---
        spectrum = np.fft.fft(audio)

        # Build the analytic-signal multiplier:
        # h[0] = 1 (DC), h[1..N/2-1] = 2 (positive freqs),
        # h[N/2] = 1 (Nyquist), h[N/2+1..] = 0 (negative freqs)
        h = np.zeros(n)
        if n % 2 == 0:
            h[0] = h[n // 2] = 1
            h[1:n // 2] = 2
        else:
            h[0] = 1
            h[1:(n + 1) // 2] = 2

        analytic = np.fft.ifft(spectrum * h)

        # --- Instantaneous frequency ---
        # Phase derivative (rad/sample) scaled to Hz.
        phase = np.unwrap(np.angle(analytic))
        inst_freq = np.diff(phase) * (self._sample_rate / (2.0 * np.pi))

        # --- Average frequency per pixel ---
        freq_len = len(inst_freq)
        if freq_len < width:
            # Fewer freq samples than pixels — index directly
            indices = np.linspace(0, freq_len - 1, width).astype(int)
            avg_freqs = inst_freq[indices]
        else:
            # Partition the freq samples into `width` contiguous segments
            # and mean each one via reduceat.
            pixel_edges = np.linspace(0, freq_len, width + 1).astype(int)
            segment_starts = pixel_edges[:-1]
            segment_lengths = np.diff(pixel_edges)
            segment_lengths = np.maximum(segment_lengths, 1)
            sums = np.add.reduceat(inst_freq, segment_starts)
            avg_freqs = sums / segment_lengths

        # Map to pixel values (1500 Hz → 0, 2300 Hz → 255)
        normalized = (avg_freqs - FREQ_PIXEL_LOW) / (
            FREQ_PIXEL_HIGH - FREQ_PIXEL_LOW)
        # +0.5 rounds half-up before the truncating uint8 cast.
        return np.clip(normalized * 255 + 0.5, 0, 255).astype(np.uint8)

    def get_image(self) -> Image.Image | None:
        """Convert decoded channel data to a PIL Image.

        Returns:
            PIL Image in RGB mode, or None if Pillow is not available
            or decoding is incomplete.
        """
        if Image is None:
            return None

        mode = self._mode

        if mode.color_model == ColorModel.RGB:
            img = self._assemble_rgb()
        elif mode.color_model == ColorModel.YCRCB:
            img = self._assemble_ycrcb()
        elif mode.color_model == ColorModel.YCRCB_DUAL:
            img = self._assemble_ycrcb_dual()
        else:
            return None

        return self._apply_slant_correction(img)

    def _apply_slant_correction(self, img: Image.Image) -> Image.Image:
        """Apply per-row horizontal correction based on measured sync drift.

        Uses the sync deviation measurements collected during decoding to
        estimate the per-line SDR clock drift rate via linear regression,
        then circularly shifts each row to compensate. Noisy individual
        measurements are averaged out; if fewer than 10 valid measurements
        exist the image is returned unchanged.
        """
        valid = [(i, d) for i, d in enumerate(self._sync_deviations)
                 if d is not None]
        if len(valid) < 10:
            return img

        lines = np.array([x[0] for x in valid], dtype=np.float64)
        devs = np.array([x[1] for x in valid], dtype=np.float64)

        # Linear fit: deviation[i] ≈ slope × i + intercept.
        # slope < 0 → fast SDR (sync arrives early, image leans left).
        # slope > 0 → slow SDR (sync arrives late, image leans right).
        slope, _ = np.polyfit(lines, devs, 1)

        # Convert samples/line drift to pixels/line for the channel width.
        pixels_per_line = slope * self._mode.width / self._channel_samples[0]

        # Skip correction if drift is negligible (< 1 px over 20 lines).
        if abs(pixels_per_line) < 0.05:
            return img

        arr = np.array(img)
        height, width = arr.shape[:2]

        # Reject clearly implausible estimates. Even with cheap SDR clocks,
        # real SSTV slant is typically modest; extreme values are usually
        # bad sync picks that would over-correct the image.
        total_shift = abs((height - 1) * pixels_per_line)
        if total_shift > width * 0.25:
            return img

        corrected = np.empty_like(arr)

        # np.roll is circular: pixels shifted off one edge wrap to the
        # other, so no row data is lost (only displaced).
        for row in range(height):
            shift = -int(round(row * pixels_per_line))
            corrected[row] = np.roll(arr[row], shift=shift, axis=0)

        return Image.fromarray(corrected, 'RGB')

    def _assemble_rgb(self) -> Image.Image:
        """Assemble RGB image from sequential R, G, B channel data.

        Martin/Scottie channel order: G, B, R.
        """
        height = self._mode.height

        # Channel order for Martin/Scottie: [0]=G, [1]=B, [2]=R
        g_data = self._channel_data[0][:height]
        b_data = self._channel_data[1][:height]
        r_data = self._channel_data[2][:height]

        rgb = np.stack([r_data, g_data, b_data], axis=-1)
        return Image.fromarray(rgb, 'RGB')

    def _assemble_ycrcb(self) -> Image.Image:
        """Assemble image from YCrCb data (Robot modes).

        Robot36: Y every line, Cr/Cb alternating (half-rate chroma).
        Robot72: Y, Cr, Cb every line (full-rate chroma).
        """
        height = self._mode.height
        width = self._mode.width

        if not self._mode.has_half_rate_chroma:
            # Full-rate chroma (Robot72): Y, Cr, Cb as separate channels
            y_data = self._channel_data[0][:height].astype(np.float64)
            cr = self._channel_data[1][:height].astype(np.float64)
            cb = self._channel_data[2][:height].astype(np.float64)
            return self._ycrcb_to_rgb(y_data, cr, cb, height, width)

        # Half-rate chroma (Robot36): Y + alternating Cr/Cb
        y_data = self._channel_data[0][:height].astype(np.float64)
        chroma_data = self._channel_data[1][:height].astype(np.float64)

        # Separate Cr (even lines) and Cb (odd lines), then interpolate
        cr = np.zeros((height, width), dtype=np.float64)
        cb = np.zeros((height, width), dtype=np.float64)

        for line in range(height):
            if line % 2 == 0:
                cr[line] = chroma_data[line]
            else:
                cb[line] = chroma_data[line]

        # Interpolate missing chroma lines
        for line in range(height):
            if line % 2 == 1:
                # Missing Cr - interpolate from neighbors
                prev_cr = line - 1 if line > 0 else line + 1
                next_cr = line + 1 if line + 1 < height else line - 1
                cr[line] = (cr[prev_cr] + cr[next_cr]) / 2
            else:
                # Missing Cb - interpolate from neighbors
                prev_cb = line - 1 if line > 0 else line + 1
                next_cb = line + 1 if line + 1 < height else line - 1
                # NOTE(review): given how prev_cb/next_cb are chosen above,
                # these bounds checks appear unreachable for height >= 2;
                # the Cr branch omits them. Kept as-is for safety — confirm
                # before simplifying.
                if prev_cb >= 0 and next_cb < height:
                    cb[line] = (cb[prev_cb] + cb[next_cb]) / 2
                elif prev_cb >= 0:
                    cb[line] = cb[prev_cb]
                else:
                    cb[line] = cb[next_cb]

        return self._ycrcb_to_rgb(y_data, cr, cb, height, width)

    def _assemble_ycrcb_dual(self) -> Image.Image:
        """Assemble image from dual-luminance YCrCb data (PD modes).

        PD modes send Y1, Cr, Cb, Y2 per audio line, producing 2 image lines.
        The Cr/Cb pair is shared by both image lines of the pair.
        """
        audio_lines = self._total_audio_lines
        width = self._mode.width
        height = self._mode.height

        y1_data = self._channel_data[0][:audio_lines].astype(np.float64)
        cr_data = self._channel_data[1][:audio_lines].astype(np.float64)
        cb_data = self._channel_data[2][:audio_lines].astype(np.float64)
        y2_data = self._channel_data[3][:audio_lines].astype(np.float64)

        # Interleave Y1 and Y2 to produce full-height luminance
        y_full = np.zeros((height, width), dtype=np.float64)
        cr_full = np.zeros((height, width), dtype=np.float64)
        cb_full = np.zeros((height, width), dtype=np.float64)

        for i in range(audio_lines):
            even_line = i * 2
            odd_line = i * 2 + 1
            if even_line < height:
                y_full[even_line] = y1_data[i]
                cr_full[even_line] = cr_data[i]
                cb_full[even_line] = cb_data[i]
            if odd_line < height:
                y_full[odd_line] = y2_data[i]
                cr_full[odd_line] = cr_data[i]
                cb_full[odd_line] = cb_data[i]

        return self._ycrcb_to_rgb(y_full, cr_full, cb_full, height, width)

    @staticmethod
    def _ycrcb_to_rgb(y: np.ndarray, cr: np.ndarray, cb: np.ndarray,
                      height: int, width: int) -> Image.Image:
        """Convert YCrCb pixel data to an RGB PIL Image.

        Uses the SSTV convention where pixel values 0-255 map to the
        standard Y'CbCr color space used by JPEG/SSTV.

        NOTE(review): `height` and `width` are accepted but not read in
        the body; the output shape comes from the input arrays. Verify
        callers before dropping the parameters.
        """
        # Normalize from 0-255 pixel range to standard ranges
        # Y: 0-255, Cr/Cb: 0-255 centered at 128
        y_norm = y
        cr_norm = cr - 128.0
        cb_norm = cb - 128.0

        # ITU-R BT.601 conversion (full-range coefficients)
        r = y_norm + 1.402 * cr_norm
        g = y_norm - 0.344136 * cb_norm - 0.714136 * cr_norm
        b = y_norm + 1.772 * cb_norm

        # Clip and convert
        r = np.clip(r, 0, 255).astype(np.uint8)
        g = np.clip(g, 0, 255).astype(np.uint8)
        b = np.clip(b, 0, 255).astype(np.uint8)

        rgb = np.stack([r, g, b], axis=-1)
        return Image.fromarray(rgb, 'RGB')