# ascii-tts-example/render_tts_ascii.py

#!/usr/bin/env python3
"""
render_tts_ascii.py — mode #6 demo: TTS narration ASCII video ("warm machine oracle").

Pipeline:
  1. Synthesize narration locally with espeak-ng (or espeak fallback).
  2. Decode the WAV and compute per-frame RMS + transient features.
  3. Render an ASCII-glyph grid over a non-flat textured background, with
     section transitions (amber data fog -> cyan/purple signal rings ->
     punctuation star-map iris -> title card) reactive to the audio.
  4. Overlay a typewriter quote and CRT post-process (scanlines, grain, bloom).
  5. Mux PNG frames + padded audio into MP4 via ffmpeg.
"""

from __future__ import annotations

import math
import os
import shutil
import struct
import subprocess
import sys
import wave
from pathlib import Path
from typing import Optional

import numpy as np
from PIL import Image, ImageDraw, ImageFilter, ImageFont

# ------------------------------------------------------------------- config --
# All working paths are derived from the directory that contains this script.
ROOT      = Path(__file__).resolve().parent
OUT_DIR   = ROOT / "output"                  # final MP4 lands here
TMP_DIR   = ROOT / "_tmp"                    # scratch area (frames, logs, WAVs)
FRAMES    = TMP_DIR / "frames"               # per-frame PNGs
LOGS      = TMP_DIR / "logs"                 # subprocess logs
WAV_RAW   = TMP_DIR / "narration_raw.wav"    # TTS output as synthesized
WAV_PAD   = TMP_DIR / "narration_pad.wav"    # narration padded/trimmed to video length
OUT_MP4   = OUT_DIR / "tts_ascii_example.mp4"

# Output canvas size in pixels and video frame rate.
WIDTH, HEIGHT = 960, 540
FPS           = 24
# The video duration is clamped to [DUR_MIN, DUR_MAX] (see main()).
DUR_MIN       = 10.0   # seconds
DUR_MAX       = 12.0   # seconds

# Text handed to the speech synthesizer.
NARRATION = ("Listen closely. The terminal hums softly. "
             "Each tiny glyph, a small star. "
             "Small text can hold a whole universe. "
             "Welcome to the warm machine.")

# Text typed out on screen by draw_quote_overlay().
QUOTE = "small text can hold a whole universe"
# Character index at which QUOTE is split into two lines:
# QUOTE[:QUOTE_LINE_BREAK] is the first line, the remainder is the second.
QUOTE_LINE_BREAK = 21  # split index for two-line layout

# Two title-card lines: the first is drawn with the large title font, the
# second with the small one (see draw_title_overlay()).
TITLE_LINES = [
    "WARM MACHINE ORACLE",
    "ascii / tts / signal",
]

# Color palette (linear-ish RGB, 0..1)
C_AMBER    = np.array([1.00, 0.69, 0.25], dtype=np.float32)
C_TEAL     = np.array([0.16, 0.83, 0.83], dtype=np.float32)
C_PURPLE   = np.array([0.71, 0.27, 0.90], dtype=np.float32)
C_HOTAMB   = np.array([1.00, 0.92, 0.78], dtype=np.float32)
C_DEEPBG   = np.array([0.020, 0.025, 0.060], dtype=np.float32)

# Glyph palettes (intensity-ordered, low -> high). ASCII-safe fallbacks below.
# The *_PREF variants may contain non-ASCII glyphs; filter_palette() drops any
# glyph the grid font cannot render and falls back to the matching *_ASCII
# palette when too few glyphs survive. Every palette starts with a space
# (the "empty cell" glyph).
PALETTE_FOG_PREF     = " .'`,:;-~+*=oO#"
PALETTE_CIRCUIT_PREF = " .,:-=+*xX#%@▒▓█"
PALETTE_STAR_PREF    = " .,'`*+oO●★"
PALETTE_FOG_ASCII    = " .'`,:;-~+*=oO#"
PALETTE_CIRCUIT_ASCII= " .,:-=+*xX#%@&"
PALETTE_STAR_ASCII   = " .,'`*+oO#@"


# -------------------------------------------------------------- subprocess --
def run(cmd, log_name: str, check: bool = True) -> int:
    """Execute `cmd`, appending its stdout and stderr to the file LOGS/<log_name>.

    The command line itself is written to the log before the process starts.
    Returns the process exit code. When `check` is true and the exit code is
    non-zero, a short message is written to stderr and SystemExit is raised
    with that exit code.
    """
    LOGS.mkdir(parents=True, exist_ok=True)
    log_path = LOGS / log_name
    banner = "\n$ " + " ".join(str(part) for part in cmd) + "\n"
    with open(log_path, "ab") as log_file:
        log_file.write(banner.encode())
        # Flush our own header first so the child's output lands after it.
        log_file.flush()
        result = subprocess.run(cmd, stdout=log_file, stderr=subprocess.STDOUT)
    rc = result.returncode
    if rc == 0 or not check:
        return rc
    sys.stderr.write(f"[error] {cmd[0]} failed (rc={rc}); see {log_path}\n")
    raise SystemExit(rc)


# ---------------------------------------------------------------- tts step --
def generate_tts(text: str, out_path: Path) -> Path:
    """Synthesize `text` to a WAV file at `out_path` and return that path.

    Tries espeak-ng first, then espeak. A synthesizer counts as successful
    only if it exits with status 0 and leaves a file larger than 1 KiB at
    `out_path`; otherwise the next candidate is tried.

    Raises:
        RuntimeError: if no synthesizer is installed, or if every installed
            synthesizer failed (details are appended to the "tts.log" log).
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    attempted = []
    for exe in ("espeak-ng", "espeak"):
        if not shutil.which(exe):
            continue
        attempted.append(exe)
        # Drop any leftover file so a failed run can't be mistaken for success.
        if out_path.exists():
            out_path.unlink()
        # Slower, lower pitch — feels like a warm oracle.
        cmd = [exe, "-s", "148", "-p", "38", "-a", "180",
               "-v", "en+m3", "-w", str(out_path), text]
        # check=False: a failing espeak-ng must not abort the whole script
        # before the plain espeak fallback has been given a chance.
        rc = run(cmd, "tts.log", check=False)
        if rc == 0 and out_path.exists() and out_path.stat().st_size > 1024:
            return out_path
    if attempted:
        raise RuntimeError(
            f"TTS synthesis failed using {', '.join(attempted)}; see tts.log."
        )
    raise RuntimeError("Neither espeak-ng nor espeak is available.")


# ------------------------------------------------------------- audio utils --
def load_wav_mono(path: Path) -> tuple[np.ndarray, int]:
    """Read a PCM WAV file and return (mono float32 samples, sample rate).

    8-bit unsigned, 16-bit and 32-bit signed little-endian samples are
    supported and scaled to roughly [-1, 1). Multi-channel audio is
    down-mixed by averaging the channels of each frame.

    Raises:
        RuntimeError: for any other sample width.
    """
    with wave.open(str(path), "rb") as reader:
        n_channels = reader.getnchannels()
        sample_width = reader.getsampwidth()
        sample_rate = reader.getframerate()
        payload = reader.readframes(reader.getnframes())

    # sample width in bytes -> (numpy dtype, zero offset, full-scale divisor)
    decoders = {
        1: ("u1", 128.0, 128.0),
        2: ("<i2", 0.0, 32768.0),
        4: ("<i4", 0.0, 2147483648.0),
    }
    if sample_width not in decoders:
        raise RuntimeError(f"Unsupported sample width: {sample_width}")
    dtype, offset, full_scale = decoders[sample_width]
    samples = (np.frombuffer(payload, dtype=dtype).astype(np.float32) - offset) / full_scale

    if n_channels > 1:
        # Frames are interleaved: one row per frame, one column per channel.
        samples = samples.reshape(-1, n_channels).mean(axis=1)
    return samples.astype(np.float32), sample_rate


def write_wav_mono(path: Path, audio: np.ndarray, sr: int) -> None:
    """Write `audio` as a mono 16-bit PCM WAV file at sample rate `sr`.

    Samples are clipped to [-1, 1], scaled by 32767 and truncated to
    little-endian int16.
    """
    limited = np.clip(audio, -1.0, 1.0)
    frames = (limited * 32767.0).astype("<i2").tobytes()
    with wave.open(str(path), "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count is filled in by the wave module on write.
        writer.setparams((1, 2, sr, 0, "NONE", "not compressed"))
        writer.writeframes(frames)


def pad_or_trim_audio(in_path: Path, out_path: Path, target_dur: float) -> float:
    """Make the WAV at `in_path` exactly `target_dur` seconds long.

    Shorter audio is extended with very quiet seeded noise; audio that is
    long enough is cut to the target length. The result is written to
    `out_path` and its duration in seconds is returned.
    """
    samples, rate = load_wav_mono(in_path)
    wanted = int(round(target_dur * rate))
    missing = wanted - len(samples)
    if missing > 0:
        # A barely audible noise floor keeps the tail from sounding dead.
        hum = np.random.RandomState(7).randn(missing).astype(np.float32) * 0.0008
        samples = np.concatenate([samples, hum])
    else:
        samples = samples[:wanted]
    write_wav_mono(out_path, samples, rate)
    return len(samples) / rate


def compute_features(audio: np.ndarray, sr: int, fps: int,
                     n_frames: int) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-video-frame loudness and transient envelopes.

    Args:
        audio: mono samples.
        sr: audio sample rate in Hz.
        fps: video frame rate; each video frame covers sr / fps samples.
        n_frames: number of video frames to produce features for.

    Returns:
        (rms_n, decayed), two float32 arrays of length `n_frames`:
        `rms_n` is the lightly smoothed RMS level normalized by its maximum;
        `decayed` is the positive frame-to-frame change of `rms_n`,
        normalized by its maximum and held with a 0.78-per-frame decay.
        Both are empty when `n_frames` is not positive.
    """
    if n_frames <= 0:
        # Nothing to analyse; the max()/[0] lookups below need >= 1 frame.
        return np.zeros(0, dtype=np.float32), np.zeros(0, dtype=np.float32)

    spf = sr / fps  # samples per video frame (may be fractional)
    rms = np.zeros(n_frames, dtype=np.float32)
    for i in range(n_frames):
        s = int(i * spf)
        e = min(len(audio), int((i + 1) * spf))
        if e > s:
            seg = audio[s:e]
            # The epsilon keeps sqrt away from an exact zero.
            rms[i] = float(np.sqrt(np.mean(seg * seg) + 1e-12))
    # Light smoothing
    if len(rms) > 5:
        k = np.array([0.15, 0.7, 0.15], dtype=np.float32)
        rms = np.convolve(rms, k, mode="same")
    # Normalize (leave an all-zero envelope untouched).
    peak = float(rms.max())
    rms_n = rms / (peak if peak > 0 else 1.0)
    # Transients: positive change only; the first frame has no predecessor,
    # so its change is defined as zero.
    trans = np.clip(np.diff(rms_n, prepend=rms_n[0]), 0.0, None)
    t_peak = float(trans.max())
    trans_n = trans / (t_peak if t_peak > 0 else 1.0)
    # Smooth transients with a short decay so beats persist a few frames
    decayed = np.zeros_like(trans_n)
    acc = 0.0
    for i, v in enumerate(trans_n):
        acc = max(v, acc * 0.78)
        decayed[i] = acc
    return rms_n, decayed


# ---------------------------------------------------------- font + sprites --
# Candidate font files, in order of preference; find_font() returns the first
# path that exists on disk. All entries are absolute paths under
# /usr/share/fonts/truetype.

# Monospace fonts for the ASCII glyph grid.
FONT_CANDIDATES = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf",
    "/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf",
]
# Monospace fonts for the typewriter quote overlay.
QUOTE_FONT_CANDIDATES = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf",
]
# Proportional bold fonts for the title card.
TITLE_FONT_CANDIDATES = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
    "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
]


def find_font(candidates) -> str:
    """Return the first entry of `candidates` that exists on disk.

    Raises:
        RuntimeError: if none of the candidate paths exists.
    """
    missing = object()
    found = next((p for p in candidates if Path(p).exists()), missing)
    if found is missing:
        raise RuntimeError("No suitable font found from candidates.")
    return found


def font_supports(font: ImageFont.FreeTypeFont, ch: str) -> bool:
    """Report whether rendering `ch` with `font` yields at least one inked pixel.

    Any exception raised while rasterizing is treated as "not supported".
    NOTE(review): a font that draws a visible placeholder box for missing
    glyphs would presumably pass this check — confirm if that matters.
    """
    try:
        mask = font.getmask(ch, mode="L")
    except Exception:
        # Best-effort probe: a font that cannot rasterize the glyph at all
        # simply does not support it.
        return False
    if mask is None:
        return False
    pixels = np.asarray(mask, dtype=np.uint8)
    if pixels.size == 0:
        return False
    return int(pixels.max()) > 0


def filter_palette(palette_pref: str, palette_ascii: str,
                   font: ImageFont.FreeTypeFont) -> str:
    """Build a glyph palette that `font` can actually render.

    The result always starts with a space. Glyphs of `palette_pref` (after its
    first character) are kept when the font renders them. If fewer than four
    glyphs remain, the same filtering is applied to `palette_ascii` instead
    (with duplicates removed, order preserved); if that still leaves fewer
    than four, a fixed minimal ASCII ramp is returned.
    """
    kept = [" "] + [g for g in palette_pref[1:]
                    if g != " " and font_supports(font, g)]

    if len(kept) < 4:
        # Too few preferred glyphs survived: retry with the ASCII-safe palette.
        fallback = [" "] + [g for g in palette_ascii[1:]
                            if g == " " or font_supports(font, g)]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        kept = list(dict.fromkeys(fallback))

    if len(kept) < 4:
        # absolute last-ditch fallback
        kept = list(" .:-=+*#@")
    return "".join(kept)


def build_sprite_array(font: ImageFont.FreeTypeFont, charset: str,
                       cell_w: int, cell_h: int) -> tuple[np.ndarray, dict]:
    """Rasterize every character of `charset` into a fixed-size alpha tile.

    Returns (sprites, char_to_idx): `sprites` is a uint8 array of shape
    (len(charset), cell_h, cell_w), and `char_to_idx` maps each character to
    its index in that array. The space character is left as an all-zero tile.
    """
    atlas = np.zeros((len(charset), cell_h, cell_w), dtype=np.uint8)
    index_of = {}
    centre = (cell_w / 2, cell_h / 2)
    for slot, glyph in enumerate(charset):
        index_of[glyph] = slot
        if glyph != " ":
            tile = Image.new("L", (cell_w, cell_h), 0)
            # Anchor "mm" centers the glyph on the cell's midpoint.
            ImageDraw.Draw(tile).text(centre, glyph, fill=255, font=font, anchor="mm")
            atlas[slot] = np.asarray(tile, dtype=np.uint8)
    return atlas, index_of


# -------------------------------------------------------- math + envelopes --
def trap(t: float, in0: float, in1: float, out0: float, out1: float) -> float:
    """Evaluate a trapezoidal envelope at time `t`.

    The value is 0 up to `in0`, rises linearly to 1 at `in1`, stays at 1
    until `out0`, falls linearly to 0 at `out1`, and is 0 afterwards.
    """
    # Guard the ramp widths so a zero-length ramp cannot divide by zero.
    rise_span = max(in1 - in0, 1e-6)
    fall_span = max(out1 - out0, 1e-6)
    if t <= in0:
        level = 0.0
    elif t < in1:
        level = (t - in0) / rise_span
    elif t < out0:
        level = 1.0
    elif t < out1:
        level = 1.0 - (t - out0) / fall_span
    else:
        level = 0.0
    return level


def smoothstep(x: float) -> float:
    """Clamp `x` to [0, 1] and apply the cubic ease 3x^2 - 2x^3."""
    u = min(1.0, x)
    u = max(0.0, u)
    return u * u * (3.0 - 2.0 * u)


# ---------------------------------------------------------------- fields ---
def fog_field(rows: int, cols: int, t: float, rng_state: np.random.RandomState) -> np.ndarray:
    """Return a (rows, cols) float32 fog intensity field for time `t`.

    The field is a sum of five low-frequency sinusoids, rescaled to 0..1 and
    then attenuated toward the top and bottom rows. `rng_state` is accepted
    for signature compatibility but is not used.
    """
    col = np.arange(cols, dtype=np.float32)[None, :]
    row = np.arange(rows, dtype=np.float32)[:, None]
    layers = (
        np.sin(col * 0.11 + t * 0.55),
        np.cos(row * 0.17 + t * 0.41),
        np.sin((col + row) * 0.08 + t * 0.33),
        np.cos((col * 0.31 - row * 0.24) + t * 0.27),
        0.35 * np.sin(col * 0.42 + row * 0.39 - t * 0.9),
    )
    fog = np.zeros((rows, cols), dtype=np.float32)
    for layer in layers:
        fog += layer

    # Rescale to 0..1 (a perfectly flat field stays at zero).
    fog -= fog.min()
    peak = fog.max()
    if peak > 0:
        fog /= peak

    # Soft horizon: full strength at the vertical middle, 55% at the top row.
    half = rows / 2.0
    horizon = 1.0 - np.abs((np.arange(rows) - half) / half)  # 0..1
    return (fog * (0.55 + 0.45 * horizon[:, None])).astype(np.float32)


def rings_field(rows: int, cols: int, t: float, energy: float,
                pulse: float) -> tuple[np.ndarray, np.ndarray]:
    """Return (intensity, angle_norm) for the concentric signal rings.

    Both outputs are float32 arrays of shape (rows, cols). `intensity` lies in
    0..1; `angle_norm` is each cell's polar angle around the grid center,
    mapped to 0..1. `energy` shifts the radius of the brightest band and
    `pulse` advances the ring phase.
    """
    mid_x = (cols - 1) / 2.0
    mid_y = (rows - 1) / 2.0
    dx = np.arange(cols, dtype=np.float32)[None, :] - mid_x
    # Rows are stretched by 1.9 as an aspect correction.
    dy = (np.arange(rows, dtype=np.float32)[:, None] - mid_y) * 1.9
    radius = np.sqrt(dx * dx + dy * dy)
    angle = (np.arctan2(dy, dx) + math.pi) / (2 * math.pi)  # 0..1

    # A fast ring pattern modulated by a slower, wider one.
    fast = 0.5 + 0.5 * np.sin(radius * 0.55 - t * 4.5 - pulse * 2.2)
    slow = 0.5 + 0.5 * np.sin(radius * 0.22 - t * 1.2)
    rings = fast * (0.55 + 0.45 * slow)

    # Gaussian emphasis around a normalized radius of 0.55 (+ energy shift).
    radius_n = radius / (np.hypot(mid_x, mid_y * 1.9) + 1e-6)
    emphasis = np.exp(-((radius_n - 0.55 - 0.18 * energy) ** 2) / 0.18)
    rings = np.clip(rings * (0.4 + 0.9 * emphasis), 0, 1)
    return rings.astype(np.float32), angle.astype(np.float32)


def star_field(rows: int, cols: int, t: float,
               seed: int = 13) -> np.ndarray:
    """Return a (rows, cols) float32 star map in 0..1 that twinkles with `t`.

    Star positions come from uniform noise seeded with `seed`, so they are
    identical on every call; only the twinkle phase depends on `t`.
    """
    noise = np.random.RandomState(seed).rand(rows, cols).astype(np.float32)
    # Each cell gets its own twinkle phase from its flat index.
    cell_id = np.arange(rows * cols).reshape(rows, cols)
    shimmer = 0.5 + 0.5 * np.sin(cell_id * 0.7 + t * 4.0)
    # Cells above 0.88 are full stars; everything above 0.78 also gets a
    # fainter contribution that follows the twinkle.
    bright = (noise > 0.88).astype(np.float32)
    faint = (noise > 0.78).astype(np.float32) * 0.35
    sky = bright * (0.7 + 0.3 * shimmer) + faint * shimmer
    return np.clip(sky, 0, 1).astype(np.float32)


# ----------------------------------------------------------- frame render --
def render_frame(f_idx: int,
                 total_frames: int,
                 total_dur: float,
                 rms: np.ndarray,
                 trans: np.ndarray,
                 sprites: np.ndarray,
                 char_to_idx: dict,
                 palettes: dict,
                 cell_w: int,
                 cell_h: int,
                 cols: int,
                 rows: int,
                 quote_font: ImageFont.FreeTypeFont,
                 title_font: ImageFont.FreeTypeFont,
                 cursor_glyph: str,
                 rng: np.random.RandomState) -> Image.Image:
    """Render video frame `f_idx` and return it as an RGB PIL image.

    Args:
        f_idx: zero-based frame index; the frame time is f_idx / FPS.
        total_frames: not used by this function.
        total_dur: video duration in seconds; all section timings are
            expressed as fractions of it.
        rms, trans: per-frame audio features, indexed by `f_idx`.
        sprites: glyph alpha tiles, indexed as sprites[glyph_index].
        char_to_idx: maps a glyph character to its index in `sprites`.
        palettes: dict with keys "fog", "circuit" and "star", each an
            intensity-ordered glyph string.
        cell_w, cell_h: glyph cell size in pixels.
        cols, rows: glyph grid size in cells.
        quote_font: font for the quote overlay.
        title_font: passed through to draw_title_overlay(), which unpacks it
            as a (big, small) pair of fonts.
        cursor_glyph: character used as the blinking typewriter cursor.
        rng: random state passed to fog_field() and postprocess().
    """
    t = f_idx / FPS
    D = total_dur
    e = float(rms[f_idx])      # loudness for this frame
    p = float(trans[f_idx])    # transient strength for this frame

    # Section envelopes (relative to total duration).
    w_fog   = trap(t, 0.0,      0.06 * D, 0.32 * D, 0.45 * D)
    w_rings = trap(t, 0.28 * D, 0.40 * D, 0.78 * D, 0.90 * D)
    w_iris  = trap(t, 0.58 * D, 0.62 * D, 0.71 * D, 0.76 * D)
    # The title's fade-out is placed after D, so it stays up until the end.
    w_title = trap(t, 0.82 * D, 0.92 * D, D + 1.0, D + 2.0)
    # Sum of section weights, used to normalize the blended color; the
    # epsilon avoids dividing by zero when no section is active.
    s = w_fog + w_rings + w_iris + 1e-6

    # --- Fields -------------------------------------------------------------
    fog_v = fog_field(rows, cols, t, rng)
    rings_v, ang = rings_field(rows, cols, t, energy=e, pulse=p)
    stars_v = star_field(rows, cols, t)

    # Per-cell intensity (weighted combination + audio modulation).
    intensity = (w_fog   * fog_v
                 + w_rings * rings_v
                 + w_iris  * stars_v)
    intensity *= (0.55 + 0.55 * e)
    # Transient pulse: brighten a radial wave outward.
    if p > 0.02:
        cy_, cx_ = (rows - 1) / 2.0, (cols - 1) / 2.0
        xx = np.arange(cols)[None, :] - cx_
        yy = (np.arange(rows)[:, None] - cy_) * 1.9
        rr = np.sqrt(xx * xx + yy * yy)
        rr_n = rr / (np.hypot(cx_, cy_ * 1.9) + 1e-6)
        # NOTE: this local name shadows the imported `wave` module inside
        # this function only; the module is not used here.
        wave = np.exp(-((rr_n - (0.15 + 0.85 * p)) ** 2) / 0.04) * p * 0.75
        intensity += wave.astype(np.float32)

    intensity = np.clip(intensity, 0.0, 1.4)
    # Adaptive tonemap: percentile-based.
    # The 96th percentile is mapped to full brightness; a frame whose
    # percentile is at or below 0.05 is rendered blank instead of amplified.
    hi = float(np.percentile(intensity, 96))
    if hi > 0.05:
        intensity = np.clip(intensity / hi, 0.0, 1.0)
    else:
        intensity = np.zeros_like(intensity)

    # --- Char selection (pick palette by dominant section) ------------------
    if w_iris > 0.40 and w_iris >= max(w_fog, w_rings) * 0.85:
        pal = palettes["star"]
    elif w_rings > w_fog:
        pal = palettes["circuit"]
    else:
        pal = palettes["fog"]
    n_pal = len(pal)
    # Quantize intensity to a position in the palette, then translate that
    # position to an index into the shared sprite atlas.
    idx_in_pal = np.clip((intensity * (n_pal - 1)).astype(np.int32), 0, n_pal - 1)
    pal_lookup = np.array([char_to_idx[c] for c in pal], dtype=np.int32)
    sprite_idx = pal_lookup[idx_in_pal]  # (rows, cols)

    # --- Per-cell color -----------------------------------------------------
    col_field = np.zeros((rows, cols, 3), dtype=np.float32)
    col_field += w_fog * C_AMBER[None, None, :]
    # Ring color blends teal -> purple with the cell's normalized angle.
    ring_color = (1.0 - ang)[..., None] * C_TEAL[None, None, :] + ang[..., None] * C_PURPLE[None, None, :]
    col_field += w_rings * ring_color
    col_field += w_iris * C_HOTAMB[None, None, :]
    col_field /= s
    # Subtle hue drift modulated by transient.
    col_field += (p * 0.18) * (C_HOTAMB[None, None, :] - col_field)
    col_field = np.clip(col_field, 0.0, 1.0)
    # Multiply by intensity for brightness.
    col_field *= intensity[..., None]

    # --- Compose ASCII grid into a pixel image -------------------------------
    # alpha tiles: (rows, cols, cell_h, cell_w)
    tiles = sprites[sprite_idx]                           # (rows, cols, ch, cw)
    alpha_big = tiles.transpose(0, 2, 1, 3).reshape(rows * cell_h, cols * cell_w)
    alpha_f = alpha_big.astype(np.float32) / 255.0
    # color_big: repeat each cell's color across its cell pixels
    color_big = np.repeat(np.repeat(col_field, cell_h, axis=0), cell_w, axis=1)

    # Canvas may be slightly smaller than WIDTH/HEIGHT — pad to full size.
    grid_h, grid_w = alpha_f.shape
    base = np.tile(C_DEEPBG, (HEIGHT, WIDTH, 1)).astype(np.float32)
    # Faint background haze (low intensity field) for the unused gutter pixels too.
    base += 0.03 * np.tile(C_PURPLE, (HEIGHT, WIDTH, 1)) * (0.4 + 0.6 * w_rings)
    base += 0.02 * np.tile(C_AMBER,  (HEIGHT, WIDTH, 1)) * (0.4 + 0.6 * w_fog)

    # Offset grid to roughly center (in case grid_w/grid_h < canvas).
    ox = (WIDTH  - grid_w) // 2
    oy = (HEIGHT - grid_h) // 2
    base[oy:oy + grid_h, ox:ox + grid_w, :] += alpha_f[..., None] * color_big

    # Iris brief darkening: pull background toward black, leave stars bright.
    # NOTE(review): the scale below is applied after the glyphs were added to
    # `base`, so it dims glyphs and background alike — confirm whether the
    # glyphs were meant to be exempt as the comment above suggests.
    if w_iris > 0.05:
        darken = 1.0 - 0.85 * w_iris
        base *= darken

    img_f = np.clip(base, 0.0, 1.0)

    # --- Postprocess: scanlines, grain, bloom -------------------------------
    img_f = postprocess(img_f, rng)

    # Round to the nearest 8-bit value.
    img_u8 = (img_f * 255.0 + 0.5).astype(np.uint8)
    out = Image.fromarray(img_u8, mode="RGB")

    # --- Overlays: quote + title -------------------------------------------
    # Both overlay helpers draw onto `out` in place.
    draw_quote_overlay(out, t, D, e, p, quote_font, cursor_glyph)
    if w_title > 0.01:
        draw_title_overlay(out, w_title, title_font)

    return out


# --------------------------------------------------------------- postproc --
def postprocess(img: np.ndarray, rng: np.random.RandomState) -> np.ndarray:
    """Apply the CRT look to a float RGB frame and return it clipped to 0..1.

    Steps, in order: scanlines, monochrome grain drawn from `rng`, a bloom
    built from a blurred copy of the highlights, and a vignette.
    """
    height, width = img.shape[:2]

    # Scanlines: every even row is dimmed to 86%.
    row_gain = np.ones(height, dtype=np.float32)
    row_gain[::2] = 0.86
    frame = img * row_gain[:, None, None]

    # Grain: one noise value per pixel, shared by all three channels.
    noise = (rng.rand(height, width).astype(np.float32) - 0.5) * 0.045
    frame = frame + noise[..., None]

    # Bloom: keep only what exceeds 0.55, blur it, and add it back.
    highlights = np.maximum(0.0, frame - 0.55)
    highlights_u8 = np.clip(highlights * 255.0, 0, 255).astype(np.uint8)
    halo_img = Image.fromarray(highlights_u8, "RGB").filter(
        ImageFilter.GaussianBlur(radius=5)
    )
    halo = np.asarray(halo_img, dtype=np.float32) / 255.0
    frame = frame + halo * 0.65

    # Vignette: up to 35% darker toward the corners.
    ys = np.linspace(-1.0, 1.0, height, dtype=np.float32)[:, None]
    xs = np.linspace(-1.0, 1.0, width, dtype=np.float32)[None, :]
    dist2 = xs * xs + ys * ys
    shade = 1.0 - 0.35 * np.clip(dist2, 0.0, 1.0)
    frame = frame * shade[..., None]

    return np.clip(frame, 0.0, 1.0)


# ----------------------------------------------------------------- text ----
def draw_quote_overlay(img: Image.Image, t: float, D: float,
                       rms_v: float, pulse_v: float,
                       font: ImageFont.FreeTypeFont, cursor_glyph: str) -> None:
    """Draw the typewriter quote card onto `img` in place.

    Nothing is drawn before 0.32 * D. From then until 0.75 * D the quote is
    revealed character by character (eased with smoothstep) with a blinking
    cursor; afterwards the full quote stays visible. `rms_v` drives the
    opacity of the card's border lines and `pulse_v` brightens the second
    text line.
    """
    t_begin = 0.32 * D
    t_finish = 0.75 * D
    if t < t_begin:
        return

    # How many characters of the quote are visible at this time.
    reveal = smoothstep(min(1.0, (t - t_begin) / max(t_finish - t_begin, 1e-6)))
    n_shown = int(round(reveal * len(QUOTE)))
    shown = QUOTE[:n_shown]

    if len(shown) > QUOTE_LINE_BREAK:
        top_line, bottom_line = QUOTE[:QUOTE_LINE_BREAK], shown[QUOTE_LINE_BREAK:]
    else:
        top_line, bottom_line = shown, ""

    # While typing, append the cursor (or a space when it is blinked off) to
    # the line currently being typed.
    cursor = cursor_glyph if int(t * 2.6) % 2 == 0 else " "
    if n_shown < len(QUOTE):
        typing_bottom = bool(bottom_line) or len(shown) >= QUOTE_LINE_BREAK
        if typing_bottom:
            bottom_line = (bottom_line + cursor)[:32]
        else:
            top_line = (top_line + cursor)[:32]

    draw = ImageDraw.Draw(img, "RGBA")

    # Backdrop card.
    box_w, box_h = 700, 120
    cx = WIDTH // 2
    y_top = int(HEIGHT * 0.66)
    left = cx - box_w // 2
    right = cx + box_w // 2
    card = Image.new("RGBA", (box_w, box_h), (6, 8, 22, 195))
    img.paste(card, (left, y_top), card)

    # Border lines (amber on top, teal at the bottom), louder audio = more opaque.
    edge_alpha = int(160 + 70 * rms_v)
    draw.line([(left + 12, y_top + 2), (right - 12, y_top + 2)],
              fill=(255, 180, 80, edge_alpha), width=2)
    draw.line([(left + 12, y_top + box_h - 3), (right - 12, y_top + box_h - 3)],
              fill=(70, 220, 220, edge_alpha), width=2)

    # Text is drawn twice: a blurred copy behind, then the sharp text on top.
    top_y = y_top + 30
    bottom_y = y_top + 75
    halo_color = (255, 200, 110, 230)
    halo = Image.new("RGBA", img.size, (0, 0, 0, 0))
    halo_draw = ImageDraw.Draw(halo)
    halo_draw.text((cx, top_y), top_line, font=font, anchor="mm", fill=halo_color)
    halo_draw.text((cx, bottom_y), bottom_line, font=font, anchor="mm", fill=halo_color)
    soft = halo.filter(ImageFilter.GaussianBlur(radius=4))
    img.paste(soft, (0, 0), soft)

    draw.text((cx, top_y), top_line, font=font, anchor="mm", fill=(255, 235, 200, 255))
    if pulse_v > 0.15:
        bottom_color = (255, 220, 160, 255)
    else:
        bottom_color = (255, 200, 120, 255)
    draw.text((cx, bottom_y), bottom_line, font=font, anchor="mm", fill=bottom_color)


def draw_title_overlay(img: Image.Image, weight: float,
                       font_pair: tuple[ImageFont.FreeTypeFont, ImageFont.FreeTypeFont]) -> None:
    """Draw the two-line title card onto `img` in place.

    `weight` (0..1) is eased with smoothstep and used as the text opacity.
    `font_pair` is (font for TITLE_LINES[0], font for TITLE_LINES[1]).
    """
    big, small = font_pair
    draw = ImageDraw.Draw(img, "RGBA")
    fade = smoothstep(weight)
    alpha_main = int(255 * fade)
    alpha_sub = int(220 * fade)
    cx = WIDTH // 2
    y_main = int(HEIGHT * 0.18)
    y_sub = y_main + 48

    # (text, font, y, glow color, sharp color) for each title line.
    lines = (
        (TITLE_LINES[0], big, y_main,
         (255, 190, 100, alpha_main), (255, 230, 170, alpha_main)),
        (TITLE_LINES[1], small, y_sub,
         (80, 220, 220, alpha_sub), (160, 240, 240, alpha_sub)),
    )

    # Glow pass: render to a transparent layer, blur it, composite it.
    halo = Image.new("RGBA", img.size, (0, 0, 0, 0))
    halo_draw = ImageDraw.Draw(halo)
    for text, font, y, glow_color, _ in lines:
        halo_draw.text((cx, y), text, font=font, anchor="mm", fill=glow_color)
    soft = halo.filter(ImageFilter.GaussianBlur(radius=6))
    img.paste(soft, (0, 0), soft)

    # Sharp pass on top of the glow.
    for text, font, y, _, sharp_color in lines:
        draw.text((cx, y), text, font=font, anchor="mm", fill=sharp_color)


# ----------------------------------------------------------------- mux -----
def mux_video(frames_glob_dir: Path, audio_path: Path,
              out_path: Path, total_dur: float) -> None:
    """Encode the PNG frame sequence plus the audio file into an MP4 with ffmpeg.

    Frames are read from `frames_glob_dir` using the pattern f_%05d.png at
    FPS frames per second; the output is limited to `total_dur` seconds.
    ffmpeg's output is appended to the "ffmpeg_mux.log" log.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    frame_pattern = str(frames_glob_dir / "f_%05d.png")
    inputs = ["-framerate", str(FPS), "-i", frame_pattern, "-i", str(audio_path)]
    # Video from the first input, audio from the second.
    stream_map = ["-map", "0:v:0", "-map", "1:a:0"]
    video_opts = ["-c:v", "libx264", "-pix_fmt", "yuv420p",
                  "-preset", "medium", "-crf", "20"]
    audio_opts = ["-c:a", "aac", "-b:a", "160k"]
    output_opts = ["-t", f"{total_dur:.3f}", "-movflags", "+faststart", str(out_path)]
    run(["ffmpeg", "-y", *inputs, *stream_map, *video_opts, *audio_opts, *output_opts],
        "ffmpeg_mux.log")


def ffprobe_info(path: Path) -> dict:
    """Probe a media file with ffprobe and summarize its streams.

    Returns a dict of the form
        {"streams": [{"codec_type": ..., "codec_name": ...,
                      "stream_duration": ...}, ...],
         "format_duration": <str or None>}
    where "codec_name" and "stream_duration" appear only when ffprobe reports
    them, and durations are the strings ffprobe printed. If ffprobe produces
    no parseable output, the stream list is empty and "format_duration" is
    None. The command, its stderr and its stdout are appended to
    LOGS/ffprobe.log.
    """
    import json  # local import: only this function needs it

    # The JSON writer groups fields per stream. The flat key=value writer
    # prints codec_name before codec_type, so a line-by-line parser that
    # starts a new stream at codec_type attaches each codec_name to the
    # previous stream.
    cmd = ["ffprobe", "-v", "error",
           "-show_entries", "stream=codec_type,codec_name,duration:format=duration",
           "-of", "json", str(path)]
    LOGS.mkdir(parents=True, exist_ok=True)
    log_path = LOGS / "ffprobe.log"
    with open(log_path, "ab") as f:
        f.write(("\n$ " + " ".join(cmd) + "\n").encode())
        f.flush()
        r = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=f)
    out = r.stdout.decode("utf-8", "replace")
    with open(log_path, "ab") as f:
        f.write(out.encode())

    info = {"streams": [], "format_duration": None}
    try:
        data = json.loads(out) if out.strip() else {}
    except json.JSONDecodeError:
        # ffprobe failed or printed something unexpected; report "nothing found".
        return info
    if not isinstance(data, dict):
        return info

    for stream in data.get("streams") or []:
        entry = {"codec_type": stream.get("codec_type")}
        if "codec_name" in stream:
            entry["codec_name"] = stream["codec_name"]
        if "duration" in stream:
            entry["stream_duration"] = stream["duration"]
        info["streams"].append(entry)
    info["format_duration"] = (data.get("format") or {}).get("duration")
    return info


# ----------------------------------------------------------------- main ----
def main() -> int:
    """Run the whole pipeline: TTS, audio features, frame rendering, mux, verify.

    Returns 0 on success, 2 if the output has no video stream, 3 if it has no
    audio stream, and 4 if its duration falls outside the expected window.
    """
    for directory in (OUT_DIR, TMP_DIR, FRAMES, LOGS):
        directory.mkdir(parents=True, exist_ok=True)
    # Remove frames left over from a previous run.
    for stale in FRAMES.glob("*.png"):
        stale.unlink()

    print("[1/6] Generating TTS narration with espeak-ng...")
    generate_tts(NARRATION, WAV_RAW)
    raw_audio, sr = load_wav_mono(WAV_RAW)
    raw_dur = len(raw_audio) / sr
    print(f"      raw audio: {raw_dur:.2f}s @ {sr}Hz")

    # Narration length plus a 0.8 s tail, clamped to the allowed window.
    total_dur = max(DUR_MIN, min(DUR_MAX, raw_dur + 0.8))
    total_frames = int(round(total_dur * FPS))
    print(f"      target video: {total_dur:.2f}s, {total_frames} frames @ {FPS}fps")

    print("[2/6] Padding/trimming audio to match video duration...")
    padded_dur = pad_or_trim_audio(WAV_RAW, WAV_PAD, total_dur)
    print(f"      padded audio: {padded_dur:.2f}s")

    padded_audio, _ = load_wav_mono(WAV_PAD)
    rms_n, trans_n = compute_features(padded_audio, sr, FPS, total_frames)

    print("[3/6] Loading fonts and building glyph sprites...")
    grid_font_path = find_font(FONT_CANDIDATES)
    quote_font_path = find_font(QUOTE_FONT_CANDIDATES)
    title_font_path = find_font(TITLE_FONT_CANDIDATES)

    grid_font = ImageFont.truetype(grid_font_path, size=18)
    quote_font = ImageFont.truetype(quote_font_path, size=26)
    title_big = ImageFont.truetype(title_font_path, size=40)
    title_sm = ImageFont.truetype(title_font_path, size=22)

    # Cell height from the font's ascent + descent; cell width from the
    # advance of "M". Both have a minimum so cells never get too small.
    ascent, descent = grid_font.getmetrics()
    cell_h = max(ascent + descent, 18)
    try:
        advance = int(round(grid_font.getlength("M")))
    except Exception:
        advance = grid_font.getbbox("M")[2]
    cell_w = max(advance, 10)
    cols = WIDTH // cell_w
    rows = HEIGHT // cell_h
    print(f"      grid: {cols}x{rows} cells, cell={cell_w}x{cell_h}")

    # Keep only glyphs the grid font can render.
    palettes = {
        "fog": filter_palette(PALETTE_FOG_PREF, PALETTE_FOG_ASCII, grid_font),
        "circuit": filter_palette(PALETTE_CIRCUIT_PREF, PALETTE_CIRCUIT_ASCII, grid_font),
        "star": filter_palette(PALETTE_STAR_PREF, PALETTE_STAR_ASCII, grid_font),
    }
    pal_fog, pal_circuit, pal_star = palettes["fog"], palettes["circuit"], palettes["star"]
    # One sprite atlas holds the union of all palette glyphs.
    charset = "".join(sorted({*pal_fog, *pal_circuit, *pal_star}))
    if " " not in charset:
        charset = " " + charset
    print(f"      palettes — fog:{len(pal_fog)} circuit:{len(pal_circuit)} star:{len(pal_star)}; atlas={len(charset)}")

    sprites, char_to_idx = build_sprite_array(grid_font, charset, cell_w, cell_h)
    cursor_glyph = "_" if font_supports(quote_font, "_") else "|"

    print("[4/6] Rendering frames...")
    # A single seeded generator is shared by all frames.
    rng = np.random.RandomState(42)
    last_idx = total_frames - 1
    for f_idx in range(total_frames):
        frame = render_frame(
            f_idx=f_idx,
            total_frames=total_frames,
            total_dur=total_dur,
            rms=rms_n,
            trans=trans_n,
            sprites=sprites,
            char_to_idx=char_to_idx,
            palettes=palettes,
            cell_w=cell_w,
            cell_h=cell_h,
            cols=cols,
            rows=rows,
            quote_font=quote_font,
            title_font=(title_big, title_sm),
            cursor_glyph=cursor_glyph,
            rng=rng,
        )
        frame.save(FRAMES / f"f_{f_idx:05d}.png", optimize=False, compress_level=1)
        done = f_idx + 1
        if done % 24 == 0 or f_idx == last_idx:
            print(f"      frame {done}/{total_frames}")

    print("[5/6] Muxing video + audio with ffmpeg...")
    mux_video(FRAMES, WAV_PAD, OUT_MP4, total_dur)

    print("[6/6] Verifying output with ffprobe...")
    report = ffprobe_info(OUT_MP4)
    kinds = [stream.get("codec_type") for stream in report["streams"]]
    print(f"      streams: {report['streams']}")
    print(f"      format duration: {report['format_duration']}")
    dur = float(report["format_duration"] or 0.0)
    if "video" not in kinds:
        print("[FAIL] no video stream in output")
        return 2
    if "audio" not in kinds:
        print("[FAIL] no audio stream in output")
        return 3
    if not (DUR_MIN - 0.5 <= dur <= DUR_MAX + 0.5):
        print(f"[FAIL] duration {dur:.2f}s outside expected window")
        return 4
    print(f"[OK] {OUT_MP4} — video+audio, {dur:.2f}s")
    return 0


# Script entry point: run the pipeline and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())