SKILL.md

$27

Linux requires building from source: https://voicebox.sh/linux-install

Build from Source

Prerequisites: Bun, Rust, Python 3.11+, Tauri prerequisites

git clone https://github.com/jamiepine/voicebox.git

cd voicebox

# Install just task runner

brew install just        # macOS

cargo install just       # any platform

# Set up Python venv + all dependencies

just setup

# Start backend + desktop app in dev mode

just dev

# List all available commands

just --list

Architecture

Layer

Technology

Desktop App

Tauri (Rust)

Frontend

React + TypeScript + Tailwind CSS

State

Zustand + React Query

Backend

FastAPI (Python) on port 17493

TTS Engines

Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA

Effects

Pedalboard (Spotify)

Transcription

Whisper / Whisper Turbo

Inference

MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU)

Database

SQLite

The Python FastAPI backend handles all ML inference. The Tauri Rust shell wraps the frontend and manages the backend process lifecycle. The API is accessible directly at http://localhost:17493 even when using the desktop app.

REST API Reference

Base URL: http://localhost:17493

Interactive docs: http://localhost:17493/docs

Generate Speech

# Basic generation

curl -X POST http://localhost:17493/generate \

  -H "Content-Type: application/json" \

  -d '{

    "text": "Hello world, this is a voice clone.",

    "profile_id": "abc123",

    "language": "en"

  }'

# With engine selection

curl -X POST http://localhost:17493/generate \

  -H "Content-Type: application/json" \

  -d '{

    "text": "Speak slowly and with gravitas.",

    "profile_id": "abc123",

    "language": "en",

    "engine": "qwen3-tts"

  }'

# With paralinguistic tags (Chatterbox Turbo only)

curl -X POST http://localhost:17493/generate \

  -H "Content-Type: application/json" \

  -d '{

    "text": "That is absolutely hilarious! [laugh] I cannot believe it.",

    "profile_id": "abc123",

    "engine": "chatterbox-turbo",

    "language": "en"

  }'

Voice Profiles

# List all profiles

curl http://localhost:17493/profiles

# Create a new profile

curl -X POST http://localhost:17493/profiles \

  -H "Content-Type: application/json" \

  -d '{

    "name": "Narrator",

    "language": "en",

    "description": "Deep narrative voice"

  }'

# Upload audio sample to a profile

curl -X POST http://localhost:17493/profiles/{profile_id}/samples \

  -F "file=@/path/to/voice-sample.wav"

# Export a profile

curl http://localhost:17493/profiles/{profile_id}/export \

  --output narrator-profile.zip

# Import a profile

curl -X POST http://localhost:17493/profiles/import \

  -F "file=@narrator-profile.zip"

Generation Queue & Status

# Get generation status (SSE stream)

curl -N http://localhost:17493/generate/{generation_id}/status

# List recent generations

curl http://localhost:17493/generations

# Retry a failed generation

curl -X POST http://localhost:17493/generations/{generation_id}/retry

# Download generated audio

curl http://localhost:17493/generations/{generation_id}/audio \

  --output output.wav

Models

# List available models and download status

curl http://localhost:17493/models

# Unload a model from GPU memory (without deleting)

curl -X POST http://localhost:17493/models/{model_id}/unload

TypeScript/JavaScript Integration

Basic TTS Client

const VOICEBOX_URL = process.env.VOICEBOX_API_URL ?? "http://localhost:17493";

interface GenerateRequest {

  text: string;

  profile_id: string;

  language?: string;

  engine?: "qwen3-tts" | "luxtts" | "chatterbox" | "chatterbox-turbo" | "tada";

}

interface GenerateResponse {

  generation_id: string;

  status: "queued" | "processing" | "complete" | "failed";

  audio_url?: string;

}

async function generateSpeech(req: GenerateRequest): Promise<GenerateResponse> {

  const response = await fetch(`${VOICEBOX_URL}/generate`, {

    method: "POST",

    headers: { "Content-Type": "application/json" },

    body: JSON.stringify(req),

  });

  if (!response.ok) {

    throw new Error(`Voicebox API error: ${response.status} ${await response.text()}`);

  }

  return response.json();

}

// Usage

const result = await generateSpeech({

  text: "Welcome to our application.",

  profile_id: "abc123",

  language: "en",

  engine: "qwen3-tts",

});

console.log("Generation ID:", result.generation_id);

Poll for Completion

async function waitForGeneration(

  generationId: string,

  timeoutMs = 60_000

): Promise<string> {

  const start = Date.now();

  while (Date.now() - start < timeoutMs) {

    const res = await fetch(`${VOICEBOX_URL}/generations/${generationId}`);

    const data = await res.json();

    if (data.status === "complete") {

      return `${VOICEBOX_URL}/generations/${generationId}/audio`;

    }

    if (data.status === "failed") {

      throw new Error(`Generation failed: ${data.error}`);

    }

    await new Promise((r) => setTimeout(r, 1000));

  }

  throw new Error("Generation timed out");

}

Stream Status with SSE

function streamGenerationStatus(

  generationId: string,

  onStatus: (status: string) => void

): () => void {

  const eventSource = new EventSource(

    `${VOICEBOX_URL}/generate/${generationId}/status`

  );

  eventSource.onmessage = (event) => {

    const data = JSON.parse(event.data);

    onStatus(data.status);

    if (data.status === "complete" || data.status === "failed") {

      eventSource.close();

    }

  };

  eventSource.onerror = () => eventSource.close();

  // Return cleanup function

  return () => eventSource.close();

}

// Usage

const cleanup = streamGenerationStatus("gen_abc123", (status) => {

  console.log("Status update:", status);

});

Download Audio as Blob

async function downloadAudio(generationId: string): Promise<Blob> {

  const response = await fetch(

    `${VOICEBOX_URL}/generations/${generationId}/audio`

  );

  if (!response.ok) {

    throw new Error(`Failed to download audio: ${response.status}`);

  }

  return response.blob();

}

// Play in browser

async function playGeneratedAudio(generationId: string): Promise<void> {

  const blob = await downloadAudio(generationId);

  const url = URL.createObjectURL(blob);

  const audio = new Audio(url);

  audio.play();

  audio.onended = () => URL.revokeObjectURL(url);

}

Python Integration

import httpx

import asyncio

VOICEBOX_URL = "http://localhost:17493"

async def generate_speech(

    text: str,

    profile_id: str,

    language: str = "en",

    engine: str = "qwen3-tts"

) -> bytes:

    async with httpx.AsyncClient(timeout=120.0) as client:

        # Submit generation

        resp = await client.post(

            f"{VOICEBOX_URL}/generate",

            json={

                "text": text,

                "profile_id": profile_id,

                "language": language,

                "engine": engine,

            }

        )

        resp.raise_for_status()

        generation_id = resp.json()["generation_id"]

        # Poll until complete

        for _ in range(120):

            status_resp = await client.get(

                f"{VOICEBOX_URL}/generations/{generation_id}"

            )

            status_data = status_resp.json()

            if status_data["status"] == "complete":

                audio_resp = await client.get(

                    f"{VOICEBOX_URL}/generations/{generation_id}/audio"

                )

                return audio_resp.content

            if status_data["status"] == "failed":

                raise RuntimeError(f"Generation failed: {status_data.get('error')}")

            await asyncio.sleep(1.0)

        raise TimeoutError("Generation timed out after 120s")

# Usage

audio_bytes = asyncio.run(

    generate_speech(

        text="The quick brown fox jumps over the lazy dog.",

        profile_id="your-profile-id",

        language="en",

        engine="chatterbox",

    )

)

with open("output.wav", "wb") as f:

    f.write(audio_bytes)

TTS Engine Selection Guide

Engine

Best For

Languages

VRAM

Notes

qwen3-tts (0.6B/1.7B)

Quality + instructions

Medium

Supports delivery instructions in text

luxtts

Fast CPU generation

English only

~1GB

150x realtime on CPU, 48kHz

chatterbox

Multilingual coverage

Medium

Arabic, Hindi, Swahili, CJK + more

chatterbox-turbo

Expressive/emotion

English only

Low (350M)

Use [laugh], [sigh], [gasp] tags

tada (1B/3B)

Long-form coherence

High

700s+ audio, HumeAI model

Delivery Instructions (Qwen3-TTS)

Embed natural language instructions directly in the text:

await generateSpeech({

  text: "(whisper) I have a secret to tell you.",

  profile_id: "abc123",

  engine: "qwen3-tts",

});

await generateSpeech({

  text: "(speak slowly and clearly) Step one: open the application.",

  profile_id: "abc123",

  engine: "qwen3-tts",

});

Paralinguistic Tags (Chatterbox Turbo)

const tags = [

  "[laugh]", "[chuckle]", "[gasp]", "[cough]",

  "[sigh]", "[groan]", "[sniff]", "[shush]", "[clear throat]"

];

await generateSpeech({

  text: "Oh really? [gasp] I had no idea! [laugh] That's incredible.",

  profile_id: "abc123",

  engine: "chatterbox-turbo",

});

Environment & Configuration

# Custom models directory (set before launching)

export VOICEBOX_MODELS_DIR=/path/to/models

# For AMD ROCm GPU (auto-configured, but can override)

export HSA_OVERRIDE_GFX_VERSION=11.0.0

Docker configuration (docker-compose.yml override):

services:

  voicebox:

    environment:

      - VOICEBOX_MODELS_DIR=/models

    volumes:

      - /host/models:/models

    ports:

      - "17493:17493"

    # For NVIDIA GPU passthrough:

    deploy:

      resources:

        reservations:

          devices:

            - driver: nvidia

              count: 1

              capabilities: [gpu]

Common Patterns

Voice Profile Creation Flow

// 1. Create profile

const profile = await fetch(`${VOICEBOX_URL}/profiles`, {

  method: "POST",

  headers: { "Content-Type": "application/json" },

  body: JSON.stringify({ name: "My Voice", language: "en" }),

}).then((r) => r.json());

// 2. Upload audio sample (WAV/MP3, ideally 5–30 seconds clean speech)

const formData = new FormData();

formData.append("file", audioBlob, "sample.wav");

await fetch(`${VOICEBOX_URL}/profiles/${profile.id}/samples`, {

  method: "POST",

  body: formData,

});

// 3. Generate with the new profile

const gen = await generateSpeech({

  text: "Testing my cloned voice.",

  profile_id: profile.id,

});

Batch Generation with Queue

async function batchGenerate(

  items: Array<{ text: string; profileId: string }>,

  engine = "qwen3-tts"

): Promise<string[]> {

  // Submit all — Voicebox queues them serially to avoid GPU contention

  const submissions = await Promise.all(

    items.map((item) =>

      generateSpeech({ text: item.text, profile_id: item.profileId, engine })

    )

  );

  // Wait for all completions

  const audioUrls = await Promise.all(

    submissions.map((s) => waitForGeneration(s.generation_id))

  );

  return audioUrls;

}

Long-Form Text (Auto-Chunking)

Voicebox auto-chunks at sentence boundaries — just send the full text:

const longScript = `

  Chapter one. The morning fog rolled across the valley floor...

  // Up to 50,000 characters supported

`;

await generateSpeech({

  text: longScript,

  profile_id: "narrator-profile-id",

  engine: "tada", // Best for long-form coherence

  language: "en",

});

Troubleshooting

API not responding

# Check if backend is running

curl http://localhost:17493/health

# Restart backend only (dev mode)

just backend

# Check logs

just logs

GPU not detected

# Check detected backend

curl http://localhost:17493/system/info

# Force CPU mode (set before launch)

export VOICEBOX_FORCE_CPU=1

Model download fails / slow

# Set custom models directory with more space

export VOICEBOX_MODELS_DIR=/path/with/space

just dev

# Cancel stuck download via API

curl -X DELETE http://localhost:17493/models/{model_id}/download

Out of VRAM — unload models

# List loaded models

curl http://localhost:17493/models | jq '.[] | select(.loaded == true)'

# Unload specific model

curl -X POST http://localhost:17493/models/{model_id}/unload

Audio quality issues

Use 5–30 seconds of clean, noise-free speech for voice samples

Multiple samples improve clone quality — upload 3–5 different sentences

For multilingual cloning, use chatterbox engine

Ensure sample audio is 16kHz+ mono WAV for best results

Use luxtts for highest output quality (48kHz) in English

Generation stuck in queue after crash

Voicebox auto-recovers stale generations on startup. If the issue persists:

curl -X POST http://localhost:17493/generations/{generation_id}/retry

Frontend Integration (React Example)

import { useState } from "react";

const VOICEBOX_URL = import.meta.env.VITE_VOICEBOX_URL ?? "http://localhost:17493";

export function VoiceGenerator({ profileId }: { profileId: string }) {

  const [text, setText] = useState("");

  const [audioUrl, setAudioUrl] = useState<string | null>(null);

  const [loading, setLoading] = useState(false);

  const handleGenerate = async () => {

    setLoading(true);

    try {

      const res = await fetch(`${VOICEBOX_URL}/generate`, {

        method: "POST",

        headers: { "Content-Type": "application/json" },

        body: JSON.stringify({ text, profile_id: profileId, language: "en" }),

      });

      const { generation_id } = await res.json();

      // Poll for completion

      let done = false;

      while (!done) {

        await new Promise((r) => setTimeout(r, 1000));

        const statusRes = await fetch(`${VOICEBOX_URL}/generations/${generation_id}`);

        const { status } = await statusRes.json();

        if (status === "complete") {

          setAudioUrl(`${VOICEBOX_URL}/generations/${generation_id}/audio`);

          done = true;

        } else if (status === "failed") {

          throw new Error("Generation failed");

        }

      }

    } finally {

      setLoading(false);

    }

  };

  return (

    <div>

      <textarea value={text} onChange={(e) => setText(e.target.value)} />

      <button onClick={handleGenerate} disabled={loading}>

        {loading ? "Generating..." : "Generate Speech"}

      </button>

      {audioUrl &#x26;&#x26; <audio controls src={audioUrl} />}

    </div>

  );

}

voicebox-voice-synthesis

SKILL.md

Build from Source

Architecture

REST API Reference

Generate Speech

Voice Profiles

Generation Queue & Status

Models

TypeScript/JavaScript Integration

Basic TTS Client

Poll for Completion

Stream Status with SSE

Download Audio as Blob

Python Integration

TTS Engine Selection Guide

Delivery Instructions (Qwen3-TTS)

Paralinguistic Tags (Chatterbox Turbo)

Environment & Configuration

Common Patterns

Voice Profile Creation Flow

Batch Generation with Queue

Long-Form Text (Auto-Chunking)

Troubleshooting

API not responding

GPU not detected

Model download fails / slow

Out of VRAM — unload models

Audio quality issues

Generation stuck in queue after crash

Frontend Integration (React Example)

Stop writing automation&scrapers

voicebox-voice-synthesis

SKILL.md

Build from Source

Architecture

REST API Reference

Generate Speech

Voice Profiles

Generation Queue &#x26; Status

Models

TypeScript/JavaScript Integration

Basic TTS Client

Poll for Completion

Stream Status with SSE

Download Audio as Blob

Python Integration

TTS Engine Selection Guide

Delivery Instructions (Qwen3-TTS)

Paralinguistic Tags (Chatterbox Turbo)

Environment &#x26; Configuration

Common Patterns

Voice Profile Creation Flow

Batch Generation with Queue

Long-Form Text (Auto-Chunking)

Troubleshooting

API not responding

GPU not detected

Model download fails / slow

Out of VRAM — unload models

Audio quality issues

Generation stuck in queue after crash

Frontend Integration (React Example)

Let your agent run on any real-world website

Related skills

Stop writing automation&scrapers

Generation Queue & Status

Environment & Configuration