Files
2026-05-15 01:23:51 -04:00

734 lines
28 KiB
Python

#!/usr/bin/env python3
"""
render_tts_ascii.py — mode #6 demo: TTS narration ASCII video ("warm machine oracle").
Pipeline:
1. Synthesize narration locally with espeak-ng (or espeak fallback).
2. Decode the WAV and compute per-frame RMS + transient features.
3. Render an ASCII-glyph grid over a non-flat textured background, with
section transitions (amber data fog -> cyan/purple signal rings ->
punctuation star-map iris -> title card) reactive to the audio.
4. Overlay a typewriter quote and CRT post-process (scanlines, grain, bloom).
5. Mux PNG frames + padded audio into MP4 via ffmpeg.
"""
from __future__ import annotations
import math
import os
import shutil
import struct
import subprocess
import sys
import wave
from pathlib import Path
from typing import Optional
import numpy as np
from PIL import Image, ImageDraw, ImageFilter, ImageFont
# ------------------------------------------------------------------- config --
# Filesystem layout: every path is resolved relative to this script's folder.
ROOT = Path(__file__).resolve().parent
OUT_DIR = ROOT / "output"
TMP_DIR = ROOT / "_tmp"
FRAMES = TMP_DIR / "frames"  # rendered PNG frames are written here by main()
LOGS = TMP_DIR / "logs"  # output of external tools (espeak, ffmpeg, ffprobe)
WAV_RAW = TMP_DIR / "narration_raw.wav"  # narration exactly as synthesized
WAV_PAD = TMP_DIR / "narration_pad.wav"  # narration padded/trimmed to video length
OUT_MP4 = OUT_DIR / "tts_ascii_example.mp4"
# Output geometry (pixels) and frame rate.
WIDTH, HEIGHT = 960, 540
FPS = 24
# main() clamps the video length into [DUR_MIN, DUR_MAX].
DUR_MIN = 10.0 # seconds
DUR_MAX = 12.0
# Text handed to the speech synthesizer.
NARRATION = ("Listen closely. The terminal hums softly. "
             "Each tiny glyph, a small star. "
             "Small text can hold a whole universe. "
             "Welcome to the warm machine.")
# Text typed out by the on-screen typewriter overlay.
QUOTE = "small text can hold a whole universe"
QUOTE_LINE_BREAK = 21 # split index for two-line layout
# Title card: first entry is drawn with the large font, second with the small one.
TITLE_LINES = [
    "WARM MACHINE ORACLE",
    "ascii / tts / signal",
]
# Color palette (linear-ish RGB, 0..1)
C_AMBER = np.array([1.00, 0.69, 0.25], dtype=np.float32)
C_TEAL = np.array([0.16, 0.83, 0.83], dtype=np.float32)
C_PURPLE = np.array([0.71, 0.27, 0.90], dtype=np.float32)
C_HOTAMB = np.array([1.00, 0.92, 0.78], dtype=np.float32)
C_DEEPBG = np.array([0.020, 0.025, 0.060], dtype=np.float32)
# Glyph palettes (intensity-ordered, low -> high). ASCII-safe fallbacks below.
# The *_PREF strings are tried first; filter_palette() falls back to the
# matching *_ASCII string when too few preferred glyphs survive the font check.
PALETTE_FOG_PREF = " .'`,:;-~+*=oO#"
PALETTE_CIRCUIT_PREF = " .,:-=+*xX#%@▒▓█"
PALETTE_STAR_PREF = " .,'`*+oO●★"
PALETTE_FOG_ASCII = " .'`,:;-~+*=oO#"
PALETTE_CIRCUIT_ASCII= " .,:-=+*xX#%@&"
PALETTE_STAR_ASCII = " .,'`*+oO#@"
# -------------------------------------------------------------- subprocess --
def run(cmd, log_name: str, check: bool = True) -> int:
    """Run a command, appending its stdout+stderr to ``LOGS/<log_name>``.

    The command line is written to the log first (prefixed with ``$``) so
    that successive invocations sharing a log file can be told apart.

    Args:
        cmd: Argument list passed straight to ``subprocess.run`` (no shell).
        log_name: File name inside ``LOGS``. It is used verbatim, so it must
            already carry its extension (e.g. ``"tts.log"``).
        check: When true, any failure terminates the program.

    Returns:
        The child's exit status, or 127 when the executable could not be
        launched at all and ``check`` is false.

    Raises:
        SystemExit: If ``check`` is true and the command exits non-zero or
            cannot be started.
    """
    LOGS.mkdir(parents=True, exist_ok=True)
    log_path = LOGS / log_name
    with open(log_path, "ab") as f:
        f.write(("\n$ " + " ".join(map(str, cmd)) + "\n").encode())
        # Flush before the child inherits the handle so the header precedes
        # whatever the child writes.
        f.flush()
        try:
            returncode = subprocess.run(
                cmd, stdout=f, stderr=subprocess.STDOUT
            ).returncode
        except OSError as exc:
            # Missing or non-executable binary: record it in the log and
            # report it like a shell would instead of dumping a traceback.
            f.write(f"[launch failed] {exc}\n".encode())
            returncode = 127
    if check and returncode != 0:
        sys.stderr.write(f"[error] {cmd[0]} failed (rc={returncode}); see {log_path}\n")
        raise SystemExit(returncode)
    return returncode
# ---------------------------------------------------------------- tts step --
def generate_tts(text: str, out_path: Path) -> Path:
    """Synthesize ``text`` to a WAV file at ``out_path``.

    ``espeak-ng`` is tried first and ``espeak`` second; the first engine that
    exits cleanly and leaves a file larger than 1 KiB wins.

    Args:
        text: The narration to speak.
        out_path: Destination WAV path; its parent directory is created.

    Returns:
        ``out_path`` once a usable file has been produced.

    Raises:
        RuntimeError: If no engine is installed, or every installed engine
            failed to produce a usable file.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    attempted = []
    for exe in ("espeak-ng", "espeak"):
        if not shutil.which(exe):
            continue
        attempted.append(exe)
        # Remove leftovers from an earlier run so an old file can never be
        # mistaken for the output of this attempt.
        if out_path.exists():
            out_path.unlink()
        # Slower, lower pitch — feels like a warm oracle.
        cmd = [exe, "-s", "148", "-p", "38", "-a", "180",
               "-v", "en+m3", "-w", str(out_path), text]
        # check=False: a failing engine must not abort the program, otherwise
        # the second engine would never get its turn.
        rc = run(cmd, "tts.log", check=False)
        if rc == 0 and out_path.exists() and out_path.stat().st_size > 1024:
            return out_path
    if attempted:
        raise RuntimeError(
            "TTS synthesis failed with " + ", ".join(attempted) + "; see tts.log."
        )
    raise RuntimeError("Neither espeak-ng nor espeak is available.")
# ------------------------------------------------------------- audio utils --
def load_wav_mono(path: Path) -> tuple[np.ndarray, int]:
    """Read a PCM WAV file and return ``(samples, sample_rate)``.

    Samples are float32, scaled by the integer full-scale value of the
    file's sample width (8-, 16- and 32-bit are supported). Files with more
    than one channel are averaged down to a single channel.

    Raises:
        RuntimeError: If the sample width is not 1, 2 or 4 bytes.
    """
    with wave.open(str(path), "rb") as reader:
        rate = reader.getframerate()
        channels = reader.getnchannels()
        width = reader.getsampwidth()
        payload = reader.readframes(reader.getnframes())

    if width == 1:
        # 8-bit WAV data is unsigned and centred on 128.
        samples = (np.frombuffer(payload, dtype="u1").astype(np.float32) - 128.0) / 128.0
    elif width == 2:
        samples = np.frombuffer(payload, dtype="<i2").astype(np.float32) / 32768.0
    elif width == 4:
        samples = np.frombuffer(payload, dtype="<i4").astype(np.float32) / 2147483648.0
    else:
        raise RuntimeError(f"Unsupported sample width: {width}")

    if channels > 1:
        # Frames are interleaved; one row per frame, then average the channels.
        samples = samples.reshape(-1, channels).mean(axis=1)
    return samples.astype(np.float32), rate
def write_wav_mono(path: Path, audio: np.ndarray, sr: int) -> None:
    """Write ``audio`` as a single-channel 16-bit PCM WAV file.

    Values are clipped to [-1, 1] and scaled by 32767 before being converted
    to little-endian int16.
    """
    limited = np.clip(audio, -1.0, 1.0)
    frames = (limited * 32767.0).astype("<i2").tobytes()
    with wave.open(str(path), "wb") as writer:
        writer.setframerate(sr)
        writer.setsampwidth(2)
        writer.setnchannels(1)
        writer.writeframes(frames)
def pad_or_trim_audio(in_path: Path, out_path: Path, target_dur: float) -> float:
    """Copy ``in_path`` to ``out_path`` with a length of ``target_dur`` seconds.

    Shorter input is extended with very quiet, fixed-seed white noise; longer
    input is cut at the target length. Returns the duration written, in
    seconds.
    """
    samples, rate = load_wav_mono(in_path)
    wanted = int(round(target_dur * rate))
    shortfall = wanted - len(samples)
    if shortfall > 0:
        # A faint hum floor rather than digital silence, so the tail of the
        # video does not feel dead.
        hum = np.random.RandomState(7).randn(shortfall).astype(np.float32) * 0.0008
        samples = np.concatenate([samples, hum])
    else:
        samples = samples[:wanted]
    write_wav_mono(out_path, samples, rate)
    return len(samples) / rate
def compute_features(audio: np.ndarray, sr: int, fps: int,
                     n_frames: int) -> tuple[np.ndarray, np.ndarray]:
    """Per-video-frame RMS and transient (positive-rectified energy diff).

    Args:
        audio: Mono samples.
        sr: Sample rate of ``audio`` in Hz.
        fps: Video frame rate; each video frame covers ``sr / fps`` samples.
        n_frames: Number of video frames to produce features for.

    Returns:
        ``(rms_n, transients)``, both float32 arrays of length ``n_frames``.
        ``rms_n`` is the lightly smoothed RMS divided by its maximum;
        ``transients`` is the positive frame-to-frame rise of ``rms_n``,
        divided by its maximum and given a short decaying tail. Both are
        empty when ``n_frames`` is not positive.
    """
    if n_frames <= 0:
        # Nothing to analyse; max() and [0] below would fail on empty arrays.
        empty = np.zeros(0, dtype=np.float32)
        return empty, empty.copy()

    spf = sr / fps
    rms = np.zeros(n_frames, dtype=np.float32)
    for i in range(n_frames):
        s = int(i * spf)
        e = min(len(audio), int((i + 1) * spf))
        if e > s:
            seg = audio[s:e]
            rms[i] = float(np.sqrt(np.mean(seg * seg) + 1e-12))
    # Light smoothing
    if len(rms) > 5:
        k = np.array([0.15, 0.7, 0.15], dtype=np.float32)
        rms = np.convolve(rms, k, mode="same")
    # Normalize (an all-silent track keeps a divisor of 1 to avoid 0/0).
    peak = float(rms.max())
    rms_n = rms / (peak if peak > 0 else 1.0)
    # Transients: positive change
    diff = np.diff(rms_n, prepend=rms_n[0])
    trans = np.clip(diff, 0.0, None)
    t_peak = float(trans.max())
    trans_n = trans / (t_peak if t_peak > 0 else 1.0)
    # Smooth transients with a short decay so beats persist a few frames
    decayed = np.zeros_like(trans_n)
    acc = 0.0
    for i, v in enumerate(trans_n):
        acc = max(v, acc * 0.78)
        decayed[i] = acc
    return rms_n, decayed
# ---------------------------------------------------------- font + sprites --
# Font search lists, in priority order. find_font() returns the first path
# that exists on disk.
# Glyph grid (main() loads this one at size 18).
FONT_CANDIDATES = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf",
    "/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf",
]
# Typewriter quote overlay.
QUOTE_FONT_CANDIDATES = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf",
]
# Title card (loaded at two sizes by main()).
TITLE_FONT_CANDIDATES = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
    "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
]
def find_font(candidates) -> str:
    """Return the first entry of ``candidates`` that exists on disk.

    Raises:
        RuntimeError: If none of the candidate paths exists.
    """
    missing = object()
    hit = next((path for path in candidates if Path(path).exists()), missing)
    if hit is missing:
        raise RuntimeError("No suitable font found from candidates.")
    return hit
def font_supports(font: ImageFont.FreeTypeFont, ch: str) -> bool:
    """Report whether rendering ``ch`` with ``font`` inks at least one pixel.

    Any rendering error counts as "not supported".
    """
    # NOTE(review): a font that draws a placeholder box for unknown glyphs
    # would presumably pass this check too — confirm against the fonts in use.
    try:
        rendered = font.getmask(ch, mode="L")
    except Exception:
        return False
    if rendered is None:
        return False
    pixels = np.asarray(rendered, dtype=np.uint8)
    if not pixels.size > 0:
        return False
    return int(pixels.max()) > 0
def filter_palette(palette_pref: str, palette_ascii: str,
                   font: ImageFont.FreeTypeFont) -> str:
    """Reduce a glyph palette to characters that ``font`` can draw.

    The result always begins with a space. ``palette_pref`` is tried first;
    if fewer than four characters survive, ``palette_ascii`` is filtered
    instead, and if that is still too short a fixed ASCII ramp is returned.
    Duplicates are dropped, keeping the first occurrence.
    """
    # The leading space is kept unconditionally; other spaces are skipped.
    kept = [" "]
    kept.extend(g for g in palette_pref[1:] if g != " " and font_supports(font, g))

    if len(kept) < 4:
        # Too few preferred glyphs render: switch to the ASCII-safe list.
        kept = [" "]
        kept.extend(g for g in palette_ascii[1:] if g == " " or font_supports(font, g))

    # dict preserves insertion order, so this de-duplicates stably.
    unique = list(dict.fromkeys(kept))
    if len(unique) < 4:
        # absolute last-ditch fallback
        return " .:-=+*#@"
    return "".join(unique)
def build_sprite_array(font: ImageFont.FreeTypeFont, charset: str,
                       cell_w: int, cell_h: int) -> tuple[np.ndarray, dict]:
    """Rasterize every character of ``charset`` into a fixed-size alpha tile.

    Returns:
        ``(atlas, index_of)`` where ``atlas`` is a uint8 array of shape
        ``(len(charset), cell_h, cell_w)`` and ``index_of`` maps each
        character to its row in ``atlas``. The space character is left as an
        all-zero tile.
    """
    atlas = np.zeros((len(charset), cell_h, cell_w), dtype=np.uint8)
    index_of = {}
    centre = (cell_w / 2, cell_h / 2)
    for slot, glyph in enumerate(charset):
        index_of[glyph] = slot
        if glyph != " ":
            tile = Image.new("L", (cell_w, cell_h), 0)
            # anchor="mm" centres the glyph on the middle of its cell.
            ImageDraw.Draw(tile).text(centre, glyph, fill=255, font=font, anchor="mm")
            atlas[slot] = np.asarray(tile, dtype=np.uint8)
    return atlas, index_of
# -------------------------------------------------------- math + envelopes --
def trap(t: float, in0: float, in1: float, out0: float, out1: float) -> float:
    """Evaluate a trapezoidal envelope at time ``t``.

    The value rises linearly from 0 to 1 across [in0, in1], stays at 1 until
    out0, falls linearly back to 0 across [out0, out1], and is 0 elsewhere.
    Ramp widths are floored at 1e-6 so a zero-length ramp cannot divide by
    zero.
    """
    if t <= in0:
        return 0.0
    if t < in1:
        level = (t - in0) / max(in1 - in0, 1e-6)
    elif t < out0:
        level = 1.0
    elif t < out1:
        level = 1.0 - (t - out0) / max(out1 - out0, 1e-6)
    else:
        level = 0.0
    return level
def smoothstep(x: float) -> float:
    """Clamp ``x`` to [0, 1] and apply the cubic ease ``3x^2 - 2x^3``."""
    capped = x if x < 1.0 else 1.0
    clamped = capped if capped > 0.0 else 0.0
    return clamped * clamped * (3.0 - 2.0 * clamped)
# ---------------------------------------------------------------- fields ---
def fog_field(rows: int, cols: int, t: float, rng_state: np.random.RandomState) -> np.ndarray:
    """Slowly drifting fog pattern built from a handful of sinusoids.

    Returns a float32 array of shape ``(rows, cols)``. The summed waves are
    rescaled to [0, 1] and then attenuated towards the top and bottom rows.
    ``rng_state`` is accepted but not used.
    """
    col = np.arange(cols, dtype=np.float32)[None, :]
    row = np.arange(rows, dtype=np.float32)[:, None]
    waves = (
        np.sin(col * 0.11 + t * 0.55),
        np.cos(row * 0.17 + t * 0.41),
        np.sin((col + row) * 0.08 + t * 0.33),
        np.cos((col * 0.31 - row * 0.24) + t * 0.27),
        0.35 * np.sin(col * 0.42 + row * 0.39 - t * 0.9),
    )
    fog = np.zeros((rows, cols), dtype=np.float32)
    for wave_layer in waves:
        fog += wave_layer

    # Rescale so the darkest cell is 0 and (when not flat) the brightest is 1.
    fog -= fog.min()
    if fog.max() > 0:
        fog /= fog.max()

    # Soft horizon: full strength at the vertical centre, weaker at the edges.
    mid_row = rows / 2.0
    horizon = 1.0 - np.abs((np.arange(rows) - mid_row) / mid_row)  # 0..1
    fog = fog * (0.55 + 0.45 * horizon[:, None])
    return fog.astype(np.float32)
def rings_field(rows: int, cols: int, t: float, energy: float,
                pulse: float) -> tuple[np.ndarray, np.ndarray]:
    """Concentric ring pattern centred on the grid.

    Returns:
        ``(intensity, angle)`` as float32 arrays of shape ``(rows, cols)``.
        ``intensity`` is clipped to [0, 1]; ``angle`` is each cell's polar
        angle around the centre, remapped to 0..1.
    """
    mid_x = (cols - 1) / 2.0
    mid_y = (rows - 1) / 2.0
    dx = np.arange(cols, dtype=np.float32)[None, :] - mid_x
    dy = (np.arange(rows, dtype=np.float32)[:, None] - mid_y) * 1.9  # aspect correction
    radius = np.sqrt(dx * dx + dy * dy)

    # Fast rings pushed along by time and the transient pulse, modulated by
    # a slower, wider second band.
    fast = 0.5 + 0.5 * np.sin(radius * 0.55 - t * 4.5 - pulse * 2.2)
    slow = 0.5 + 0.5 * np.sin(radius * 0.22 - t * 1.2)
    rings = fast * (0.55 + 0.45 * slow)

    # Gaussian emphasis around a radius that shifts outward with energy.
    radius_n = radius / (np.hypot(mid_x, mid_y * 1.9) + 1e-6)
    emphasis = np.exp(-((radius_n - 0.55 - 0.18 * energy) ** 2) / 0.18)
    rings = np.clip(rings * (0.4 + 0.9 * emphasis), 0, 1)

    angle = (np.arctan2(dy, dx) + math.pi) / (2 * math.pi)  # 0..1
    return rings.astype(np.float32), angle.astype(np.float32)
def star_field(rows: int, cols: int, t: float,
               seed: int = 13) -> np.ndarray:
    """Sparse twinkling star map.

    Star positions come from uniform noise drawn with a fixed ``seed``, so
    they are identical on every call; only the twinkle phase depends on
    ``t``. Returns a float32 array of shape ``(rows, cols)`` in [0, 1].
    """
    noise = np.random.RandomState(seed).rand(rows, cols).astype(np.float32)
    cell_number = np.arange(rows * cols).reshape(rows, cols)
    twinkle = 0.5 + 0.5 * np.sin(cell_number * 0.7 + t * 4.0)

    # Cells above 0.88 are full stars; everything above 0.78 also gets a
    # dimmer contribution that follows the twinkle.
    bright = (noise > 0.88).astype(np.float32)
    faint = (noise > 0.78).astype(np.float32) * 0.35
    sky = bright * (0.7 + 0.3 * twinkle) + faint * twinkle
    return np.clip(sky, 0, 1).astype(np.float32)
# ----------------------------------------------------------- frame render --
def render_frame(f_idx: int,
                 total_frames: int,
                 total_dur: float,
                 rms: np.ndarray,
                 trans: np.ndarray,
                 sprites: np.ndarray,
                 char_to_idx: dict,
                 palettes: dict,
                 cell_w: int,
                 cell_h: int,
                 cols: int,
                 rows: int,
                 quote_font: ImageFont.FreeTypeFont,
                 title_font: tuple[ImageFont.FreeTypeFont, ImageFont.FreeTypeFont],
                 cursor_glyph: str,
                 rng: np.random.RandomState) -> Image.Image:
    """Render video frame ``f_idx`` as an RGB image of size WIDTH x HEIGHT.

    Args:
        f_idx: Frame number; the frame time is ``f_idx / FPS``.
        total_frames: Not used by this function.
        total_dur: Video length in seconds; section timings are fractions of it.
        rms: Per-frame loudness, indexed by ``f_idx``.
        trans: Per-frame transient strength, indexed by ``f_idx``.
        sprites: Glyph alpha tiles, indexed ``[glyph, y, x]``.
        char_to_idx: Maps a character to its index in ``sprites``.
        palettes: Glyph ramps under the keys "fog", "circuit" and "star".
        cell_w: Glyph cell width in pixels.
        cell_h: Glyph cell height in pixels.
        cols: Grid width in cells.
        rows: Grid height in cells.
        quote_font: Font for the typewriter quote.
        title_font: ``(big, small)`` font pair forwarded to draw_title_overlay.
        cursor_glyph: Character used for the typewriter cursor.
        rng: Shared random stream; consumed by postprocess() for film grain.

    Returns:
        The finished frame with post-processing and text overlays applied.
    """
    t = f_idx / FPS
    D = total_dur
    e = float(rms[f_idx])    # loudness for this frame
    p = float(trans[f_idx])  # transient strength for this frame
    # Section envelopes (relative to total duration).
    w_fog = trap(t, 0.0, 0.06 * D, 0.32 * D, 0.45 * D)
    w_rings = trap(t, 0.28 * D, 0.40 * D, 0.78 * D, 0.90 * D)
    w_iris = trap(t, 0.58 * D, 0.62 * D, 0.71 * D, 0.76 * D)
    # The title's fade-out lies beyond the end of the video, so once shown it stays.
    w_title = trap(t, 0.82 * D, 0.92 * D, D + 1.0, D + 2.0)
    # Sum of section weights, used to normalize the color mix; the epsilon
    # keeps the division defined when every section weight is zero.
    s = w_fog + w_rings + w_iris + 1e-6
    # --- Fields -------------------------------------------------------------
    fog_v = fog_field(rows, cols, t, rng)
    rings_v, ang = rings_field(rows, cols, t, energy=e, pulse=p)
    stars_v = star_field(rows, cols, t)
    # Per-cell intensity (weighted combination + audio modulation).
    intensity = (w_fog * fog_v
                 + w_rings * rings_v
                 + w_iris * stars_v)
    intensity *= (0.55 + 0.55 * e)
    # Transient pulse: brighten a radial wave outward.
    if p > 0.02:
        cy_, cx_ = (rows - 1) / 2.0, (cols - 1) / 2.0
        xx = np.arange(cols)[None, :] - cx_
        yy = (np.arange(rows)[:, None] - cy_) * 1.9
        rr = np.sqrt(xx * xx + yy * yy)
        rr_n = rr / (np.hypot(cx_, cy_ * 1.9) + 1e-6)
        # NOTE: this local is named like the stdlib `wave` module and hides
        # it inside this function only.
        wave = np.exp(-((rr_n - (0.15 + 0.85 * p)) ** 2) / 0.04) * p * 0.75
        intensity += wave.astype(np.float32)
    intensity = np.clip(intensity, 0.0, 1.4)
    # Adaptive tonemap: percentile-based.
    hi = float(np.percentile(intensity, 96))
    if hi > 0.05:
        intensity = np.clip(intensity / hi, 0.0, 1.0)
    else:
        # Practically empty frame: blank it rather than amplify tiny values.
        intensity = np.zeros_like(intensity)
    # --- Char selection (pick palette by dominant section) ------------------
    if w_iris > 0.40 and w_iris >= max(w_fog, w_rings) * 0.85:
        pal = palettes["star"]
    elif w_rings > w_fog:
        pal = palettes["circuit"]
    else:
        pal = palettes["fog"]
    n_pal = len(pal)
    # Quantize intensity to a palette position, then translate palette
    # positions into sprite-atlas indices.
    idx_in_pal = np.clip((intensity * (n_pal - 1)).astype(np.int32), 0, n_pal - 1)
    pal_lookup = np.array([char_to_idx[c] for c in pal], dtype=np.int32)
    sprite_idx = pal_lookup[idx_in_pal] # (rows, cols)
    # --- Per-cell color -----------------------------------------------------
    col_field = np.zeros((rows, cols, 3), dtype=np.float32)
    col_field += w_fog * C_AMBER[None, None, :]
    # Ring color blends teal -> purple with the cell's angle around the center.
    ring_color = (1.0 - ang)[..., None] * C_TEAL[None, None, :] + ang[..., None] * C_PURPLE[None, None, :]
    col_field += w_rings * ring_color
    col_field += w_iris * C_HOTAMB[None, None, :]
    col_field /= s
    # Subtle hue drift modulated by transient.
    col_field += (p * 0.18) * (C_HOTAMB[None, None, :] - col_field)
    col_field = np.clip(col_field, 0.0, 1.0)
    # Multiply by intensity for brightness.
    col_field *= intensity[..., None]
    # --- Compose ASCII grid into a pixel image -------------------------------
    # alpha tiles: (rows, cols, cell_h, cell_w)
    tiles = sprites[sprite_idx] # (rows, cols, ch, cw)
    # Interleave cell rows with pixel rows so the tiles form one flat image.
    alpha_big = tiles.transpose(0, 2, 1, 3).reshape(rows * cell_h, cols * cell_w)
    alpha_f = alpha_big.astype(np.float32) / 255.0
    # color_big: repeat each cell's color across its cell pixels
    color_big = np.repeat(np.repeat(col_field, cell_h, axis=0), cell_w, axis=1)
    # Canvas may be slightly smaller than WIDTH/HEIGHT — pad to full size.
    grid_h, grid_w = alpha_f.shape
    base = np.tile(C_DEEPBG, (HEIGHT, WIDTH, 1)).astype(np.float32)
    # Faint background haze (low intensity field) for the unused gutter pixels too.
    base += 0.03 * np.tile(C_PURPLE, (HEIGHT, WIDTH, 1)) * (0.4 + 0.6 * w_rings)
    base += 0.02 * np.tile(C_AMBER, (HEIGHT, WIDTH, 1)) * (0.4 + 0.6 * w_fog)
    # Offset grid to roughly center (in case grid_w/grid_h < canvas).
    ox = (WIDTH - grid_w) // 2
    oy = (HEIGHT - grid_h) // 2
    base[oy:oy + grid_h, ox:ox + grid_w, :] += alpha_f[..., None] * color_big
    # Iris brief darkening: pull background toward black, leave stars bright.
    # NOTE(review): the glyphs were already added to `base` above, so this
    # scale dims them together with the background — confirm that is intended.
    if w_iris > 0.05:
        darken = 1.0 - 0.85 * w_iris
        base *= darken
    img_f = np.clip(base, 0.0, 1.0)
    # --- Postprocess: scanlines, grain, bloom -------------------------------
    img_f = postprocess(img_f, rng)
    # +0.5 rounds to nearest when converting to 8-bit.
    img_u8 = (img_f * 255.0 + 0.5).astype(np.uint8)
    out = Image.fromarray(img_u8, mode="RGB")
    # --- Overlays: quote + title -------------------------------------------
    draw_quote_overlay(out, t, D, e, p, quote_font, cursor_glyph)
    if w_title > 0.01:
        draw_title_overlay(out, w_title, title_font)
    return out
# --------------------------------------------------------------- postproc --
def postprocess(img: np.ndarray, rng: np.random.RandomState) -> np.ndarray:
    """Apply the CRT look to a float RGB frame and return it clipped to [0, 1].

    Steps, in order: scanlines, monochrome grain drawn from ``rng``, a blurred
    highlight bloom, and a vignette.
    """
    height, width = img.shape[:2]

    # Scanlines: every even row is dimmed.
    row_gain = np.ones(height, dtype=np.float32)
    row_gain[::2] = 0.86
    frame = img * row_gain[:, None, None]

    # Grain: one noise value per pixel, shared by all three channels.
    noise = (rng.rand(height, width).astype(np.float32) - 0.5) * 0.045
    frame = frame + noise[..., None]

    # Bloom: isolate what exceeds 0.55, blur it, and add part of it back.
    highlights = np.maximum(0.0, frame - 0.55)
    highlights_u8 = np.clip(highlights * 255.0, 0, 255).astype(np.uint8)
    halo_img = Image.fromarray(highlights_u8, "RGB").filter(ImageFilter.GaussianBlur(radius=5))
    halo = np.asarray(halo_img, dtype=np.float32) / 255.0
    frame = frame + halo * 0.65

    # Vignette: darken with squared distance from the centre, capped at 35%.
    v_axis = np.linspace(-1.0, 1.0, height, dtype=np.float32)[:, None]
    h_axis = np.linspace(-1.0, 1.0, width, dtype=np.float32)[None, :]
    dist2 = h_axis * h_axis + v_axis * v_axis
    shade = 1.0 - 0.35 * np.clip(dist2, 0.0, 1.0)
    frame = frame * shade[..., None]
    return np.clip(frame, 0.0, 1.0)
# ----------------------------------------------------------------- text ----
def draw_quote_overlay(img: Image.Image, t: float, D: float,
                       rms_v: float, pulse_v: float,
                       font: ImageFont.FreeTypeFont, cursor_glyph: str) -> None:
    """Draw the typewriter quote card onto ``img`` in place.

    Nothing is drawn before 32% of the duration ``D``. From there until 75%
    the characters of QUOTE are revealed with an eased progress, followed by
    a blinking cursor; afterwards the complete quote is shown. The text is
    laid out on two lines split at QUOTE_LINE_BREAK.
    """
    t_begin = 0.32 * D
    t_finish = 0.75 * D
    if t < t_begin:
        return

    # How much of the quote has been "typed" so far.
    eased = smoothstep(min(1.0, (t - t_begin) / max(t_finish - t_begin, 1e-6)))
    n_shown = int(round(eased * len(QUOTE)))
    typed = QUOTE[:n_shown]
    first = typed[:QUOTE_LINE_BREAK]
    second = typed[QUOTE_LINE_BREAK:]

    # While typing, append the cursor to whichever line is being written.
    if n_shown < len(QUOTE):
        caret = cursor_glyph if int(t * 2.6) % 2 == 0 else " "
        if second or len(typed) >= QUOTE_LINE_BREAK:
            second = (second + caret)[:32]
        else:
            first = (first + caret)[:32]

    painter = ImageDraw.Draw(img, "RGBA")

    # Backdrop card.
    card_w, card_h = 700, 120
    mid_x = WIDTH // 2
    top = int(HEIGHT * 0.66)
    left = mid_x - card_w // 2
    right = mid_x + card_w // 2
    card = Image.new("RGBA", (card_w, card_h), (6, 8, 22, 195))
    img.paste(card, (left, top), card)

    # Top and bottom border lines; their alpha follows the loudness.
    edge_alpha = int(160 + 70 * rms_v)
    painter.line([(left + 12, top + 2), (right - 12, top + 2)],
                 fill=(255, 180, 80, edge_alpha), width=2)
    painter.line([(left + 12, top + card_h - 3), (right - 12, top + card_h - 3)],
                 fill=(70, 220, 220, edge_alpha), width=2)

    # Text is drawn twice: a blurred copy underneath, then a sharp copy.
    first_y = top + 30
    second_y = top + 75
    halo = Image.new("RGBA", img.size, (0, 0, 0, 0))
    halo_painter = ImageDraw.Draw(halo)
    halo_color = (255, 200, 110, 230)
    halo_painter.text((mid_x, first_y), first, font=font, anchor="mm", fill=halo_color)
    halo_painter.text((mid_x, second_y), second, font=font, anchor="mm", fill=halo_color)
    soft = halo.filter(ImageFilter.GaussianBlur(radius=4))
    img.paste(soft, (0, 0), soft)

    painter.text((mid_x, first_y), first, font=font, anchor="mm",
                 fill=(255, 235, 200, 255))
    # The second line brightens slightly on strong transients.
    if pulse_v > 0.15:
        second_fill = (255, 220, 160, 255)
    else:
        second_fill = (255, 200, 120, 255)
    painter.text((mid_x, second_y), second, font=font, anchor="mm", fill=second_fill)
def draw_title_overlay(img: Image.Image, weight: float,
                       font_pair: tuple[ImageFont.FreeTypeFont, ImageFont.FreeTypeFont]) -> None:
    """Draw the two-line title card onto ``img`` in place.

    ``weight`` (0..1) is eased with smoothstep and used as the text opacity.
    ``font_pair`` is ``(big, small)``: the first title line uses the big
    font, the second the small one.
    """
    big, small = font_pair
    ease = smoothstep(weight)
    alpha_main = int(255 * ease)
    alpha_sub = int(220 * ease)
    mid_x = WIDTH // 2
    main_y = int(HEIGHT * 0.18)
    sub_y = main_y + 48

    # Blurred glow pass underneath.
    halo = Image.new("RGBA", img.size, (0, 0, 0, 0))
    halo_painter = ImageDraw.Draw(halo)
    halo_painter.text((mid_x, main_y), TITLE_LINES[0], font=big, anchor="mm",
                      fill=(255, 190, 100, alpha_main))
    halo_painter.text((mid_x, sub_y), TITLE_LINES[1], font=small, anchor="mm",
                      fill=(80, 220, 220, alpha_sub))
    soft = halo.filter(ImageFilter.GaussianBlur(radius=6))

    painter = ImageDraw.Draw(img, "RGBA")
    img.paste(soft, (0, 0), soft)

    # Sharp pass on top.
    painter.text((mid_x, main_y), TITLE_LINES[0], font=big, anchor="mm",
                 fill=(255, 230, 170, alpha_main))
    painter.text((mid_x, sub_y), TITLE_LINES[1], font=small, anchor="mm",
                 fill=(160, 240, 240, alpha_sub))
# ----------------------------------------------------------------- mux -----
def mux_video(frames_glob_dir: Path, audio_path: Path,
              out_path: Path, total_dur: float) -> None:
    """Encode the numbered PNG frames plus the audio track into ``out_path``.

    Frames are read as ``f_%05d.png`` from ``frames_glob_dir`` at FPS frames
    per second. Video is encoded with libx264 (yuv420p), audio with AAC, and
    the result is limited to ``total_dur`` seconds. ffmpeg output goes to
    ``ffmpeg_mux.log``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    frame_pattern = str(frames_glob_dir / "f_%05d.png")
    sources = ["-framerate", str(FPS), "-i", frame_pattern, "-i", str(audio_path)]
    stream_map = ["-map", "0:v:0", "-map", "1:a:0"]
    video_opts = ["-c:v", "libx264", "-pix_fmt", "yuv420p",
                  "-preset", "medium", "-crf", "20"]
    audio_opts = ["-c:a", "aac", "-b:a", "160k"]
    container_opts = ["-t", f"{total_dur:.3f}", "-movflags", "+faststart"]
    cmd = ["ffmpeg", "-y", *sources, *stream_map, *video_opts, *audio_opts,
           *container_opts, str(out_path)]
    run(cmd, "ffmpeg_mux.log")
def _parse_ffprobe_json(text: str) -> dict:
    """Convert ffprobe's JSON report into the dict returned by ffprobe_info."""
    import json  # only needed here; keeps the module's import block untouched

    info = {"streams": [], "format_duration": None}
    try:
        report = json.loads(text) if text.strip() else {}
    except json.JSONDecodeError:
        # ffprobe failed or printed something unexpected: report "nothing found".
        return info
    if not isinstance(report, dict):
        return info

    for raw in report.get("streams") or []:
        if not isinstance(raw, dict) or "codec_type" not in raw:
            continue
        stream = {"codec_type": str(raw["codec_type"])}
        if "codec_name" in raw:
            stream["codec_name"] = str(raw["codec_name"])
        if "duration" in raw:
            stream["stream_duration"] = str(raw["duration"])
        info["streams"].append(stream)

    container = report.get("format")
    if isinstance(container, dict) and "duration" in container:
        info["format_duration"] = str(container["duration"])
    return info


def ffprobe_info(path: Path) -> dict:
    """Inspect a media file with ffprobe.

    The command line, ffprobe's stderr and its stdout are appended to
    ``LOGS/ffprobe.log``.

    Args:
        path: Media file to inspect.

    Returns:
        ``{"streams": [...], "format_duration": str | None}``. Each stream
        entry holds ``"codec_type"`` plus, when ffprobe reports them,
        ``"codec_name"`` and ``"stream_duration"``. All values are strings.
        When ffprobe yields no usable report, ``streams`` is empty and
        ``format_duration`` is ``None``.
    """
    # JSON output groups the fields of each stream in one object, so the
    # result does not depend on the order in which ffprobe prints them.
    cmd = ["ffprobe", "-v", "error",
           "-show_entries", "stream=codec_type,codec_name,duration:format=duration",
           "-of", "json", str(path)]
    LOGS.mkdir(parents=True, exist_ok=True)
    log_path = LOGS / "ffprobe.log"
    with open(log_path, "ab") as f:
        f.write(("\n$ " + " ".join(cmd) + "\n").encode())
        f.flush()
        r = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=f)
    out = r.stdout.decode("utf-8", "replace")
    with open(log_path, "ab") as f:
        f.write(out.encode())
    return _parse_ffprobe_json(out)
# ----------------------------------------------------------------- main ----
def main() -> int:
    """Run the whole pipeline: TTS, analysis, rendering, muxing, verification.

    Returns:
        0 on success; 2 if the output has no video stream, 3 if it has no
        audio stream, 4 if its duration is outside the expected window.
    """
    for folder in (OUT_DIR, TMP_DIR, FRAMES, LOGS):
        folder.mkdir(parents=True, exist_ok=True)
    # Frames from an earlier run would otherwise be picked up by ffmpeg.
    for stale in FRAMES.glob("*.png"):
        stale.unlink()

    print("[1/6] Generating TTS narration with espeak-ng...")
    generate_tts(NARRATION, WAV_RAW)
    raw_samples, rate = load_wav_mono(WAV_RAW)
    raw_secs = len(raw_samples) / rate
    print(f" raw audio: {raw_secs:.2f}s @ {rate}Hz")
    # Narration plus a short tail, clamped to the allowed video length.
    video_secs = max(DUR_MIN, min(DUR_MAX, raw_secs + 0.8))
    n_frames = int(round(video_secs * FPS))
    print(f" target video: {video_secs:.2f}s, {n_frames} frames @ {FPS}fps")

    print("[2/6] Padding/trimming audio to match video duration...")
    padded_secs = pad_or_trim_audio(WAV_RAW, WAV_PAD, video_secs)
    print(f" padded audio: {padded_secs:.2f}s")
    padded_samples, _ = load_wav_mono(WAV_PAD)
    loudness, transients = compute_features(padded_samples, rate, FPS, n_frames)

    print("[3/6] Loading fonts and building glyph sprites...")
    grid_font_file = find_font(FONT_CANDIDATES)
    quote_font_file = find_font(QUOTE_FONT_CANDIDATES)
    title_font_file = find_font(TITLE_FONT_CANDIDATES)
    grid_font = ImageFont.truetype(grid_font_file, size=18)
    quote_font = ImageFont.truetype(quote_font_file, size=26)
    title_fonts = (
        ImageFont.truetype(title_font_file, size=40),
        ImageFont.truetype(title_font_file, size=22),
    )

    # Cell size comes from the grid font's metrics, with lower bounds.
    ascent, descent = grid_font.getmetrics()
    cell_h = max(ascent + descent, 18)
    # Use 'M' advance, clamp wider so dense glyphs don't clip.
    try:
        advance = int(round(grid_font.getlength("M")))
    except Exception:
        advance = grid_font.getbbox("M")[2]
    cell_w = max(advance, 10)
    cols = WIDTH // cell_w
    rows = HEIGHT // cell_h
    print(f" grid: {cols}x{rows} cells, cell={cell_w}x{cell_h}")

    # Keep only glyphs the grid font can actually draw.
    palettes = {
        "fog": filter_palette(PALETTE_FOG_PREF, PALETTE_FOG_ASCII, grid_font),
        "circuit": filter_palette(PALETTE_CIRCUIT_PREF, PALETTE_CIRCUIT_ASCII, grid_font),
        "star": filter_palette(PALETTE_STAR_PREF, PALETTE_STAR_ASCII, grid_font),
    }
    # One sprite atlas covering the union of all palettes.
    charset = "".join(sorted(set(palettes["fog"] + palettes["circuit"] + palettes["star"])))
    if " " not in charset:
        charset = " " + charset
    print(f" palettes — fog:{len(palettes['fog'])} circuit:{len(palettes['circuit'])} star:{len(palettes['star'])}; atlas={len(charset)}")
    sprites, char_to_idx = build_sprite_array(grid_font, charset, cell_w, cell_h)
    cursor_glyph = "_" if font_supports(quote_font, "_") else "|"

    print("[4/6] Rendering frames...")
    grain_rng = np.random.RandomState(42)
    for frame_no in range(n_frames):
        frame = render_frame(
            f_idx=frame_no,
            total_frames=n_frames,
            total_dur=video_secs,
            rms=loudness,
            trans=transients,
            sprites=sprites,
            char_to_idx=char_to_idx,
            palettes=palettes,
            cell_w=cell_w,
            cell_h=cell_h,
            cols=cols,
            rows=rows,
            quote_font=quote_font,
            title_font=title_fonts,
            cursor_glyph=cursor_glyph,
            rng=grain_rng,
        )
        frame.save(FRAMES / f"f_{frame_no:05d}.png", optimize=False, compress_level=1)
        done = frame_no + 1
        if done % 24 == 0 or frame_no == n_frames - 1:
            print(f" frame {done}/{n_frames}")

    print("[5/6] Muxing video + audio with ffmpeg...")
    mux_video(FRAMES, WAV_PAD, OUT_MP4, video_secs)

    print("[6/6] Verifying output with ffprobe...")
    info = ffprobe_info(OUT_MP4)
    kinds = [stream.get("codec_type") for stream in info["streams"]]
    print(f" streams: {info['streams']}")
    print(f" format duration: {info['format_duration']}")
    measured = float(info["format_duration"] or 0.0)
    if "video" not in kinds:
        print("[FAIL] no video stream in output")
        return 2
    if "audio" not in kinds:
        print("[FAIL] no audio stream in output")
        return 3
    within_window = DUR_MIN - 0.5 <= measured <= DUR_MAX + 0.5
    if not within_window:
        print(f"[FAIL] duration {measured:.2f}s outside expected window")
        return 4
    print(f"[OK] {OUT_MP4} — video+audio, {measured:.2f}s")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())