Files
intercept/utils/sstv/image_decoder.py
2026-02-19 12:18:20 +00:00

566 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""SSTV scanline-by-scanline image decoder.
Decodes raw audio samples into a PIL Image for all supported SSTV modes.
Handles sync pulse re-synchronization on each line for robust decoding
under weak-signal or drifting conditions.
"""
from __future__ import annotations
from typing import Callable
import numpy as np
from .constants import (
FREQ_BLACK,
FREQ_PIXEL_HIGH,
FREQ_PIXEL_LOW,
FREQ_SYNC,
SAMPLE_RATE,
)
from .dsp import (
goertzel,
goertzel_batch,
samples_for_duration,
)
from .modes import (
ColorModel,
SSTVMode,
SyncPosition,
)
# Pillow is an optional dependency: the import is guarded so this module
# stays importable without it installed; get_image() then returns None
# instead of producing an image.
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment,misc]
# Type alias for progress callback: (current_line, total_lines)
ProgressCallback = Callable[[int, int], None]
class SSTVImageDecoder:
"""Decode an SSTV image from a stream of audio samples.
Usage::
decoder = SSTVImageDecoder(mode)
decoder.feed(samples)
...
if decoder.is_complete:
image = decoder.get_image()
"""
def __init__(self, mode: SSTVMode, sample_rate: int = SAMPLE_RATE,
             progress_cb: ProgressCallback | None = None):
    """Set up decoder state and pre-computed per-line timing for *mode*.

    Args:
        mode: SSTV mode descriptor (timings, geometry, color model).
        sample_rate: Audio sample rate in Hz.
        progress_cb: Optional callback invoked as
            ``(current_line, total_lines)`` after each decoded audio line.
    """
    self._mode = mode
    self._sample_rate = sample_rate
    self._progress_cb = progress_cb
    self._buffer = np.array([], dtype=np.float64)
    self._current_line = 0
    self._complete = False
    # Pre-calculate sample counts for each timed element of a scanline.
    self._sync_samples = samples_for_duration(
        mode.sync_duration_ms / 1000.0, sample_rate)
    self._porch_samples = samples_for_duration(
        mode.sync_porch_ms / 1000.0, sample_rate)
    self._line_samples = samples_for_duration(
        mode.line_duration_ms / 1000.0, sample_rate)
    self._separator_samples = (
        samples_for_duration(mode.channel_separator_ms / 1000.0, sample_rate)
        if mode.channel_separator_ms > 0 else 0
    )
    self._channel_samples = [
        samples_for_duration(ch.duration_ms / 1000.0, sample_rate)
        for ch in mode.channels
    ]
    # For PD modes, each "line" of audio produces 2 image lines.
    is_dual = mode.color_model == ColorModel.YCRCB_DUAL
    self._total_audio_lines = mode.height // 2 if is_dual else mode.height
    # One uint8 pixel plane per channel. PD planes are audio-line tall
    # (Y1/Cr/Cb/Y2 rows get interleaved at assembly time); all other
    # modes use full image height. The row count is channel-invariant,
    # so it is hoisted out of the comprehension.
    plane_rows = self._total_audio_lines if is_dual else mode.height
    self._channel_data: list[np.ndarray] = [
        np.zeros((plane_rows, mode.width), dtype=np.uint8)
        for _ in mode.channels
    ]
    # Track sync position for re-synchronization
    self._expected_line_start = 0  # Sample offset within buffer
    self._synced = False
    # Per-line mid-sync deviation measurements (Scottie modes only).
    # Each entry is the measured offset (in samples) of the sync pulse
    # relative to its expected position: negative = early, positive = late.
    # Used by get_image() to apply post-processing slant correction.
    self._sync_deviations: list[float | None] = []
@property
def is_complete(self) -> bool:
    """True once every audio line of the image has been decoded."""
    return self._complete
@property
def current_line(self) -> int:
    """Zero-based index of the next audio line to decode."""
    return self._current_line
@property
def total_lines(self) -> int:
    """Total audio lines in this mode (half the image height for PD modes)."""
    return self._total_audio_lines
@property
def progress_percent(self) -> int:
if self._total_audio_lines == 0:
return 0
return min(100, int(100 * self._current_line / self._total_audio_lines))
def feed(self, samples: np.ndarray) -> bool:
"""Feed audio samples into the decoder.
Args:
samples: Float64 audio samples.
Returns:
True when image is complete.
"""
if self._complete:
return True
self._buffer = np.concatenate([self._buffer, samples])
# Process complete lines.
# Guard against stalls: if _decode_line() cannot consume data
# (e.g. sub-component samples exceed line_samples due to rounding),
# break out and wait for more audio.
while not self._complete and len(self._buffer) >= self._line_samples:
prev_line = self._current_line
prev_len = len(self._buffer)
self._decode_line()
if self._current_line == prev_line and len(self._buffer) == prev_len:
break # No progress — need more data
# Prevent unbounded buffer growth - keep at most 2 lines worth
max_buffer = self._line_samples * 2
if len(self._buffer) > max_buffer and not self._complete:
self._buffer = self._buffer[-max_buffer:]
return self._complete
def _find_sync(self, search_region: np.ndarray) -> int | None:
    """Find the 1200 Hz sync pulse within a search region.

    Scans through the region looking for a stretch of 1200 Hz
    tone of approximately the right duration.

    Args:
        search_region: Audio samples to search within.

    Returns:
        Sample offset of the sync pulse start, or None if not found.
    """
    window_size = min(self._sync_samples, 200)
    if len(search_region) < window_size:
        return None
    best_pos = None
    best_energy = 0.0
    step = window_size // 2
    # The "+ 1" includes the final full window: range() excludes its
    # stop value, so the previous bound skipped the last valid position
    # (and tested nothing at all when len == window_size).
    for pos in range(0, len(search_region) - window_size + 1, step):
        chunk = search_region[pos:pos + window_size]
        sync_energy = goertzel(chunk, FREQ_SYNC, self._sample_rate)
        # Require the sync bin to clearly dominate the black-level bin
        # so pixel data near 1200 Hz is not mistaken for sync.
        black_energy = goertzel(chunk, FREQ_BLACK, self._sample_rate)
        if sync_energy > best_energy and sync_energy > black_energy * 2:
            best_energy = sync_energy
            best_pos = pos
    return best_pos
def _decode_line(self) -> None:
    """Decode one scanline from the buffer.

    Re-synchronizes on the sync pulse where the mode allows it, decodes
    each channel's pixels into self._channel_data, then advances
    self._buffer past the consumed line. Returns without consuming
    anything if the buffer does not yet hold a full line; feed()
    detects that lack of progress and waits for more audio.
    """
    if self._current_line >= self._total_audio_lines:
        self._complete = True
        return
    # Try to find sync pulse for re-synchronization
    # Search within +/-10% of expected line start
    search_margin = max(100, self._line_samples // 10)
    line_start = 0
    if self._mode.sync_position in (SyncPosition.FRONT, SyncPosition.FRONT_PD):
        # Sync is at the beginning of each line
        search_start = 0
        search_end = min(len(self._buffer), self._sync_samples + search_margin)
        search_region = self._buffer[search_start:search_end]
        sync_pos = self._find_sync(search_region)
        if sync_pos is not None:
            line_start = sync_pos
        # Skip sync + porch to get to pixel data
        pixel_start = line_start + self._sync_samples + self._porch_samples
    elif self._mode.sync_position == SyncPosition.MIDDLE:
        # Scottie: sep(1.5ms) -> G -> sep(1.5ms) -> B -> sync(9ms) -> porch(1.5ms) -> R
        # Skip initial separator (same duration as porch)
        pixel_start = self._porch_samples
        line_start = 0
    else:
        pixel_start = self._sync_samples + self._porch_samples
    # Decode each channel
    pos = pixel_start
    for ch_idx, ch_samples in enumerate(self._channel_samples):
        if pos + ch_samples > len(self._buffer):
            # Not enough data yet - put the data back and wait
            # (nothing has been consumed from self._buffer at this point).
            return
        channel_audio = self._buffer[pos:pos + ch_samples]
        pixels = self._decode_channel_pixels(channel_audio)
        self._channel_data[ch_idx][self._current_line, :] = pixels
        pos += ch_samples
        # Add inter-channel gaps based on mode family
        if ch_idx < len(self._channel_samples) - 1:
            if self._mode.sync_position == SyncPosition.MIDDLE:
                if ch_idx == 0:
                    # Scottie: separator between G and B
                    pos += self._porch_samples
                else:
                    # Scottie: sync + porch between B and R.
                    # Measure the actual sync position for post-processing
                    # slant correction without touching pos or consumed —
                    # so a noisy/false measurement never corrupts the decode.
                    r_samples = self._channel_samples[-1]
                    # Large window to cover cumulative SDR clock drift over
                    # the full image. bwd=50 saturates after ~25 lines for
                    # a 2-sample/line drift; 800 samples covers ±200 ppm.
                    bwd = min(800, pos)
                    remaining = len(self._buffer) - pos
                    # Never search past the samples still needed for the
                    # sync, porch and trailing R channel of this line.
                    fwd = min(800, max(
                        0,
                        remaining - self._sync_samples
                        - self._porch_samples - r_samples))
                    deviation: float | None = None
                    if bwd + fwd > 0:
                        region_start = pos - bwd
                        sync_region = self._buffer[
                            region_start: pos + self._sync_samples + fwd]
                        # Full-sync-length window gives best freq resolution
                        # (~111 Hz at 48 kHz) to cleanly separate 1200 Hz
                        # sync from 1500 Hz pixel data.
                        win = self._sync_samples
                        n_raw = len(sync_region) - win + 1
                        if n_raw > 0:
                            # Step 5 samples (~0.1 ms) — enough resolution
                            # for pixel-level drift, keeps batch size small.
                            step = 5
                            all_windows = (
                                np.lib.stride_tricks.sliding_window_view(
                                    sync_region, win))
                            windows = all_windows[::step]
                            energies = goertzel_batch(
                                windows,
                                np.array([FREQ_SYNC, FREQ_BLACK]),
                                self._sample_rate)
                            sync_e = energies[:, 0]
                            black_e = energies[:, 1]
                            # Same sync-vs-black dominance test as
                            # _find_sync(); windows failing it are ignored.
                            valid_mask = sync_e > black_e * 2
                            if valid_mask.any():
                                fine_idx = int(
                                    np.argmax(
                                        np.where(valid_mask, sync_e, 0.0)))
                                # Offset of the best window relative to the
                                # expected sync start (negative = early).
                                deviation = float(fine_idx * step - bwd)
                    # None is recorded when no plausible sync was seen, so
                    # the slant fit can skip this line.
                    self._sync_deviations.append(deviation)
                    pos += self._sync_samples + self._porch_samples
            elif self._separator_samples > 0:
                # Robot: separator + porch between channels
                pos += self._separator_samples
            elif (self._mode.sync_position == SyncPosition.FRONT
                    and self._mode.color_model == ColorModel.RGB):
                # Martin: porch between channels
                pos += self._porch_samples
    # Advance buffer past this line (always at least one nominal line,
    # even if re-sync shifted pos slightly short of it).
    consumed = max(pos, self._line_samples)
    self._buffer = self._buffer[consumed:]
    self._current_line += 1
    if self._progress_cb:
        self._progress_cb(self._current_line, self._total_audio_lines)
    if self._current_line >= self._total_audio_lines:
        self._complete = True
def _decode_channel_pixels(self, audio: np.ndarray) -> np.ndarray:
    """FM-demodulate one channel of one scanline into pixel values.

    Builds the analytic signal with an FFT-based Hilbert transform,
    differentiates the unwrapped phase to obtain the instantaneous
    frequency at every sample, then averages over each pixel's span.
    This gives far finer frequency resolution than short windowed
    Goertzel filters — important for fast modes (Martin2, Scottie2)
    where a pixel covers only ~11-13 audio samples.

    Args:
        audio: Audio samples for one channel of one scanline.

    Returns:
        Array of pixel values (0-255), shape (width,).
    """
    width = self._mode.width
    count = len(audio)
    if count < width:
        return np.zeros(width, dtype=np.uint8)
    # Analytic signal: keep DC (and Nyquist, for even lengths) at unit
    # weight, double the positive frequencies, zero the negative ones.
    spectrum = np.fft.fft(audio)
    weights = np.zeros(count)
    weights[0] = 1
    if count % 2 == 0:
        weights[count // 2] = 1
        weights[1:count // 2] = 2
    else:
        weights[1:(count + 1) // 2] = 2
    analytic = np.fft.ifft(spectrum * weights)
    # Instantaneous frequency = phase derivative, scaled to Hz.
    phase = np.unwrap(np.angle(analytic))
    inst_freq = np.diff(phase) * (self._sample_rate / (2.0 * np.pi))
    freq_len = len(inst_freq)
    if freq_len < width:
        # Fewer frequency samples than pixels — pick samples directly.
        pick = np.linspace(0, freq_len - 1, width).astype(int)
        avg_freqs = inst_freq[pick]
    else:
        # Mean instantaneous frequency over each pixel's sample span.
        edges = np.linspace(0, freq_len, width + 1).astype(int)
        starts = edges[:-1]
        spans = np.maximum(np.diff(edges), 1)
        avg_freqs = np.add.reduceat(inst_freq, starts) / spans
    # Map to pixel values (1500 Hz → 0, 2300 Hz → 255), round, clip.
    scaled = (avg_freqs - FREQ_PIXEL_LOW) / (
        FREQ_PIXEL_HIGH - FREQ_PIXEL_LOW)
    return np.clip(scaled * 255 + 0.5, 0, 255).astype(np.uint8)
def get_image(self) -> Image.Image | None:
    """Convert decoded channel data to a PIL Image.

    Returns:
        PIL Image in RGB mode (with slant correction applied), or
        None if Pillow is not available or the color model is unknown.
    """
    if Image is None:
        return None
    assemblers = {
        ColorModel.RGB: self._assemble_rgb,
        ColorModel.YCRCB: self._assemble_ycrcb,
        ColorModel.YCRCB_DUAL: self._assemble_ycrcb_dual,
    }
    assemble = assemblers.get(self._mode.color_model)
    if assemble is None:
        return None
    return self._apply_slant_correction(assemble())
def _apply_slant_correction(self, img: Image.Image) -> Image.Image:
"""Apply per-row horizontal correction based on measured sync drift.
Uses the sync deviation measurements collected during decoding to
estimate the per-line SDR clock drift rate via linear regression,
then circularly shifts each row to compensate. Noisy individual
measurements are averaged out; if fewer than 10 valid measurements
exist the image is returned unchanged.
"""
valid = [(i, d) for i, d in enumerate(self._sync_deviations)
if d is not None]
if len(valid) < 10:
return img
lines = np.array([x[0] for x in valid], dtype=np.float64)
devs = np.array([x[1] for x in valid], dtype=np.float64)
# Linear fit: deviation[i] ≈ slope × i + intercept.
# slope < 0 → fast SDR (sync arrives early, image leans left).
# slope > 0 → slow SDR (sync arrives late, image leans right).
slope, _ = np.polyfit(lines, devs, 1)
# Convert samples/line drift to pixels/line for the channel width.
pixels_per_line = slope * self._mode.width / self._channel_samples[0]
# Skip correction if drift is negligible (< 1 px over 20 lines).
if abs(pixels_per_line) < 0.05:
return img
arr = np.array(img)
height, width = arr.shape[:2]
# Reject clearly implausible estimates. Even with cheap SDR clocks,
# real SSTV slant is typically modest; extreme values are usually
# bad sync picks that would over-correct the image.
total_shift = abs((height - 1) * pixels_per_line)
if total_shift > width * 0.25:
return img
corrected = np.empty_like(arr)
for row in range(height):
shift = -int(round(row * pixels_per_line))
corrected[row] = np.roll(arr[row], shift=shift, axis=0)
return Image.fromarray(corrected, 'RGB')
def _assemble_rgb(self) -> Image.Image:
    """Build an RGB image from the decoded channel planes.

    Martin/Scottie transmit channels in G, B, R order, so the planes
    are reordered to R, G, B while stacking.
    """
    rows = self._mode.height
    green = self._channel_data[0][:rows]
    blue = self._channel_data[1][:rows]
    red = self._channel_data[2][:rows]
    return Image.fromarray(np.stack([red, green, blue], axis=-1), 'RGB')
def _assemble_ycrcb(self) -> Image.Image:
    """Assemble image from YCrCb data (Robot modes).

    Robot36: Y every line, Cr/Cb alternating (half-rate chroma).
    Robot72: Y, Cr, Cb every line (full-rate chroma).
    """
    height = self._mode.height
    width = self._mode.width
    y_data = self._channel_data[0][:height].astype(np.float64)
    if not self._mode.has_half_rate_chroma:
        # Full-rate chroma (Robot72): Y, Cr, Cb as separate channels.
        cr = self._channel_data[1][:height].astype(np.float64)
        cb = self._channel_data[2][:height].astype(np.float64)
        return self._ycrcb_to_rgb(y_data, cr, cb, height, width)
    # Half-rate chroma (Robot36): channel 1 carries Cr on even lines
    # and Cb on odd lines. De-interleave with strided slicing.
    chroma = self._channel_data[1][:height].astype(np.float64)
    cr = np.zeros((height, width), dtype=np.float64)
    cb = np.zeros((height, width), dtype=np.float64)
    cr[0::2] = chroma[0::2]
    cb[1::2] = chroma[1::2]
    # Fill each line's missing chroma with the average of its nearest
    # transmitted neighbors; at the image edges the single available
    # neighbor is used twice (averaging it with itself).
    # NOTE(review): the original version carried extra bounds checks on
    # the even-line branch that were provably always true (prev >= 0 and
    # next < height by construction) — dead code, removed.
    for line in range(height):
        if line % 2 == 1:
            # Odd line: Cr missing — neighbors are even lines.
            prev_line = line - 1
            next_line = line + 1 if line + 1 < height else line - 1
            cr[line] = (cr[prev_line] + cr[next_line]) / 2
        else:
            # Even line: Cb missing — neighbors are odd lines.
            prev_line = line - 1 if line > 0 else line + 1
            next_line = line + 1 if line + 1 < height else line - 1
            cb[line] = (cb[prev_line] + cb[next_line]) / 2
    return self._ycrcb_to_rgb(y_data, cr, cb, height, width)
def _assemble_ycrcb_dual(self) -> Image.Image:
    """Assemble image from dual-luminance YCrCb data (PD modes).

    Each PD audio line carries Y1, Cr, Cb, Y2 and yields two image
    rows: Y1 on the even row, Y2 on the odd row, with the Cr/Cb pair
    shared by both.
    """
    audio_lines = self._total_audio_lines
    width = self._mode.width
    height = self._mode.height
    y1 = self._channel_data[0][:audio_lines].astype(np.float64)
    cr_half = self._channel_data[1][:audio_lines].astype(np.float64)
    cb_half = self._channel_data[2][:audio_lines].astype(np.float64)
    y2 = self._channel_data[3][:audio_lines].astype(np.float64)
    y_full = np.zeros((height, width), dtype=np.float64)
    cr_full = np.zeros((height, width), dtype=np.float64)
    cb_full = np.zeros((height, width), dtype=np.float64)
    # Audio line i covers image rows 2i and 2i+1. Since audio_lines is
    # height // 2, 2 * audio_lines never exceeds height, so the strided
    # assignments below stay in bounds (a trailing odd row stays zero).
    covered = 2 * audio_lines
    y_full[0:covered:2] = y1
    y_full[1:covered:2] = y2
    cr_full[0:covered:2] = cr_half
    cr_full[1:covered:2] = cr_half
    cb_full[0:covered:2] = cb_half
    cb_full[1:covered:2] = cb_half
    return self._ycrcb_to_rgb(y_full, cr_full, cb_full, height, width)
@staticmethod
def _ycrcb_to_rgb(y: np.ndarray, cr: np.ndarray, cb: np.ndarray,
                  height: int, width: int) -> Image.Image:
    """Convert YCrCb pixel planes to an RGB PIL Image.

    Follows the JPEG/SSTV convention: full-range luma 0-255, chroma
    0-255 centered on 128, converted with the ITU-R BT.601 matrix.
    """
    # Re-center chroma around zero before applying the matrix.
    cr_centered = cr - 128.0
    cb_centered = cb - 128.0
    # ITU-R BT.601 conversion, one expression per output channel.
    channels = [
        y + 1.402 * cr_centered,                              # R
        y - 0.344136 * cb_centered - 0.714136 * cr_centered,  # G
        y + 1.772 * cb_centered,                              # B
    ]
    clipped = [np.clip(c, 0, 255).astype(np.uint8) for c in channels]
    return Image.fromarray(np.stack(clipped, axis=-1), 'RGB')