diff --git a/.gitignore b/.gitignore
index 1d78df597..9c6a1374a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ intermediate-findings/
# Playwright
playwright-report/
test-results/
+__pycache__/
+*.pyc
\ No newline at end of file
diff --git a/README.md b/README.md
index 6057fe10c..ece4234ac 100644
--- a/README.md
+++ b/README.md
@@ -67,8 +67,8 @@ Or edit your `package.json` manually:
| [**Scenario Modeler**](examples/scenario-modeler-server) | [**Budget Allocator**](examples/budget-allocator-server) | [**Customer Segmentation**](examples/customer-segmentation-server) |
| [](examples/system-monitor-server) | [](examples/transcript-server) | [](examples/video-resource-server) |
| [**System Monitor**](examples/system-monitor-server) | [**Transcript**](examples/transcript-server) | [**Video Resource**](examples/video-resource-server) |
-| [](examples/pdf-server) | [](examples/qr-server) | |
-| [**PDF Server**](examples/pdf-server) | [**QR Code (Python)**](examples/qr-server) | |
+| [](examples/pdf-server) | [](examples/qr-server) | [](examples/say-server) |
+| [**PDF Server**](examples/pdf-server) | [**QR Code (Python)**](examples/qr-server) | [**Say Demo**](examples/say-server) |
### Starter Templates
diff --git a/examples/say-server/.gitignore b/examples/say-server/.gitignore
new file mode 100644
index 000000000..b94707787
--- /dev/null
+++ b/examples/say-server/.gitignore
@@ -0,0 +1,2 @@
+node_modules/
+dist/
diff --git a/examples/say-server/README.md b/examples/say-server/README.md
new file mode 100644
index 000000000..b6629ef82
--- /dev/null
+++ b/examples/say-server/README.md
@@ -0,0 +1,175 @@
+# Say Server - Streaming TTS MCP App
+
+A real-time text-to-speech MCP App with karaoke-style text highlighting, powered by [Kyutai's Pocket TTS](https://github.com/kyutai-labs/pocket-tts).
+
+## MCP App Features Demonstrated
+
+This example showcases several MCP App capabilities:
+
+- **Single-file executable**: Python server with embedded React UI - no build step required
+- **Partial tool inputs** (`ontoolinputpartial`): Widget receives streaming text as it's being generated
+- **Queue-based streaming**: Demonstrates how to stream text out and audio in via a polling tool (adds text to an input queue, retrieves audio chunks from an output queue)
+- **Model context updates**: Widget updates the LLM with playback progress ("Playing: ...snippet...")
+- **Native theming**: Uses CSS variables for automatic dark/light mode adaptation
+- **Fullscreen mode**: Toggle fullscreen via `requestDisplayMode()` API, press Escape to exit
+- **Multi-widget speak lock**: Coordinates multiple TTS widgets via localStorage so only one plays at a time
+- **Hidden tools** (`visibility: ["app"]`): Private tools only accessible to the widget, not the model (see the sketch after this list)
+- **External links** (`openLink`): Attribution popup uses `app.openLink()` to open external URLs
+- **CSP metadata**: Resource declares the required external domains (`esm.sh`, `unpkg.com`) for in-browser transpilation
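+
+For example, the hidden-tool pattern above is plain tool metadata. A minimal sketch mirroring the decorators used in `server.py` below (not the full implementation):
+
+```python
+from mcp.server.fastmcp import FastMCP
+
+mcp = FastMCP("Say Demo")
+
+# Public tool: its metadata points the host at the widget resource.
+@mcp.tool(meta={"ui": {"resourceUri": "ui://say-demo/widget.html"}})
+def say(text: str) -> str:
+    return f"Displayed a TTS widget for: {text[:40]}"
+
+# Private tool: only the widget ("app") may call it; it is never offered to the model.
+@mcp.tool(meta={"ui": {"visibility": ["app"]}})
+def poll_tts_audio(queue_id: str) -> str:
+    return '{"chunks": [], "done": false}'
+```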
+
+## Features
+
+- **Streaming TTS**: Audio starts playing as text is being generated
+- **Karaoke highlighting**: Words are highlighted in sync with speech
+- **Interactive controls**: Click to pause/resume, double-click to restart
+- **Low latency**: Uses a polling-based queue for minimal delay
+
+## Prerequisites
+
+- [uv](https://docs.astral.sh/uv/getting-started/installation/) - fast Python package manager
+- A CUDA GPU (recommended) or CPU with sufficient RAM (~2GB for model)
+
+## Quick Start
+
+The server is a single self-contained Python file that can be run directly with `uv`:
+
+```bash
+# Run directly (uv auto-installs dependencies)
+uv run examples/say-server/server.py
+```
+
+The server will be available at `http://localhost:3109/mcp`.
+
+## Running with Docker
+
+Run directly from GitHub using the official `uv` Docker image. Mount your HuggingFace cache to avoid re-downloading the model:
+
+```bash
+docker run --rm -it \
+  -p 3109:3109 \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  -e HF_HOME=/root/.cache/huggingface \
+  ghcr.io/astral-sh/uv:debian \
+  uv run https://raw.githubusercontent.com/modelcontextprotocol/ext-apps/main/examples/say-server/server.py
+```
+
+For GPU support, add `--gpus all` (requires [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)).
+
+## Usage
+
+### With Claude Desktop
+
+Add to your Claude Desktop config (on macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`):
+
+```json
+{
+  "mcpServers": {
+    "say": {
+      "command": "uv",
+      "args": ["run", "server.py", "--stdio"],
+      "cwd": "/path/to/examples/say-server"
+    }
+  }
+}
+```
+
+### With MCP Clients
+
+Connect to `http://localhost:3109/mcp` and call the `say` tool:
+
+```json
+{
+  "name": "say",
+  "arguments": {
+    "text": "Hello, world! This is a streaming TTS demo."
+  }
+}
+```
+
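+The same call can be scripted. A minimal sketch using the MCP Python SDK's streamable-HTTP client (API names assumed from the `mcp` package this server depends on; adjust to your SDK version):
+
+```python
+import asyncio
+
+from mcp import ClientSession
+from mcp.client.streamable_http import streamablehttp_client
+
+
+async def main() -> None:
+    # Connect to the running say-server over streamable HTTP.
+    async with streamablehttp_client("http://localhost:3109/mcp") as (read, write, _):
+        async with ClientSession(read, write) as session:
+            await session.initialize()
+            result = await session.call_tool(
+                "say",
+                {"text": "Hello, world! This is a streaming TTS demo."},
+            )
+            print(result.content[0].text)
+
+
+asyncio.run(main())
+```
+
+Note that `say` itself returns only a confirmation string (the audio pipeline runs in the widget), so for headless audio generation use the queue tools sketched under Architecture below.
+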
+## Available Voices
+
+The default voice is `cosette`. Use the `list_voices` tool or pass a `voice` parameter to `say`:
+
+### Predefined Voices
+
+- `alba`, `marius`, `javert`, `jean` - from [alba-mackenna](https://huggingface.co/kyutai/tts-voices/tree/main/alba-mackenna) (CC BY 4.0)
+- `cosette`, `eponine`, `azelma`, `fantine` - from [VCTK dataset](https://huggingface.co/kyutai/tts-voices/tree/main/vctk) (CC BY 4.0)
+
+### Custom Voices
+
+You can also use HuggingFace URLs or local file paths:
+
+```json
+{"text": "Hello!", "voice": "hf://kyutai/tts-voices/voice-donations/alice.wav"}
+{"text": "Hello!", "voice": "/path/to/my-voice.wav"}
+```
+
+See the [kyutai/tts-voices](https://huggingface.co/kyutai/tts-voices) repository for more voice collections.
+
+## Architecture
+
+The entire server is contained in a single `server.py` file:
+
+1. **`say` tool**: Public tool that triggers the widget with text to speak
+2. **Private tools** (`create_tts_queue`, `add_tts_text`, `poll_tts_audio`, etc.): Hidden from the model, only callable by the widget
+3. **Embedded React widget**: Uses [Babel standalone](https://babeljs.io/docs/babel-standalone) for in-browser JSX transpilation - no build step needed
+4. **TTS backend**: Manages per-request audio queues using Pocket TTS
+
+The widget communicates with the server via MCP tool calls (the same sequence is sketched below from a plain client):
+
+- Receives streaming text via `ontoolinputpartial` callback
+- Incrementally sends new text to the server as it arrives (via `add_tts_text`)
+- Polls for generated audio chunks while TTS runs in parallel
+- Plays audio via Web Audio API with synchronized text highlighting
+
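+Because the private queue tools are ordinary MCP tools (their `visibility: ["app"]` metadata only hides them from the model), the same sequence can be driven from any MCP client. A hedged sketch, reusing a `ClientSession` like the one in the client example above and the JSON payload shapes defined in `server.py`:
+
+```python
+import asyncio
+import json
+
+
+async def stream_tts(session, text: str) -> list[dict]:
+    """Drive the TTS queue tools the same way the widget does."""
+    def parse(result):
+        return json.loads(result.content[0].text)
+
+    # 1. Create a queue; the reply carries queue_id and sample_rate.
+    created = parse(await session.call_tool("create_tts_queue", {"voice": "cosette"}))
+    queue_id = created["queue_id"]
+
+    # 2. Feed text incrementally (the widget does this from its ontoolinputpartial callback).
+    for piece in (text[: len(text) // 2], text[len(text) // 2 :]):
+        await session.call_tool("add_tts_text", {"queue_id": queue_id, "text": piece})
+
+    # 3. Signal end of input.
+    await session.call_tool("end_tts_queue", {"queue_id": queue_id})
+
+    # 4. Poll until done; each chunk is base64 16-bit PCM plus character timing.
+    chunks: list[dict] = []
+    while True:
+        reply = parse(await session.call_tool("poll_tts_audio", {"queue_id": queue_id}))
+        chunks.extend(reply.get("chunks", []))
+        if reply.get("done"):
+            return chunks
+        await asyncio.sleep(0.2)
+```
+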
+## Multi-Widget Speak Lock
+
+When multiple TTS widgets exist in the same browser (e.g., multiple chat messages each with their own say widget), they coordinate via localStorage to ensure only one plays at a time:
+
+1. **Unique Widget IDs**: Each widget receives a UUID via `toolResult._meta.widgetUUID`
+2. **Announce on Play**: When starting, a widget writes `{uuid, timestamp}` to `localStorage["mcp-tts-playing"]`
+3. **Poll for Conflicts**: Every 200ms, playing widgets check if another widget took the lock
+4. **Yield Gracefully**: If another widget started playing, pause and yield
+5. **Clean Up**: On pause/finish, clear the lock (only if owned)
+
+This "last writer wins" protocol ensures a seamless experience: clicking play on any widget immediately pauses others, without requiring cross-iframe postMessage coordination.
+
+## TODO
+
+- Persist caret position in localStorage (resume from where you left off)
+- Click anywhere in text to move the cursor/playback position
+
+## Credits
+
+This project uses [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) by [Kyutai](https://kyutai.org/) - a fantastic open-source text-to-speech model. Thank you to the Kyutai team for making this technology available!
+
+The server includes modified Pocket TTS code to support streaming text input (text can be fed incrementally while audio generation runs in parallel). A PR contributing this functionality back to the original repo is planned.
+
+## License
+
+This example is MIT licensed.
+
+### Third-Party Licenses
+
+This project uses the following open-source components:
+
+| Component | License | Description |
+| --------------------------------------------------------------------- | ----------------- | ---------------------------- |
+| [pocket-tts](https://github.com/kyutai-labs/pocket-tts) | MIT | Python TTS library |
+| [Kyutai TTS model](https://huggingface.co/kyutai/tts-0.75b-en-public) | CC-BY 4.0 | Text-to-speech model weights |
+| [kyutai/tts-voices](https://huggingface.co/kyutai/tts-voices) | Mixed (see below) | Voice prompt files |
+
+### Voice Collection Licenses
+
+The predefined voices in this example come from **CC-BY 4.0** licensed collections; the full [kyutai/tts-voices](https://huggingface.co/kyutai/tts-voices) repository also includes collections under other licenses:
+
+| Collection | License | Commercial Use |
+| --------------- | ------------------- | ------------------------- |
+| alba-mackenna | CC-BY 4.0 | ✅ Yes (with attribution) |
+| vctk | CC-BY 4.0 | ✅ Yes (with attribution) |
+| cml-tts/fr | CC-BY 4.0 | ✅ Yes (with attribution) |
+| voice-donations | CC0 (Public Domain) | ✅ Yes |
+| **expresso** | CC-BY-NC 4.0 | ❌ Non-commercial only |
+| **ears** | CC-BY-NC 4.0 | ❌ Non-commercial only |
+
+⚠️ **Note**: If you use voices from the `expresso/` or `ears/` collections, your use is restricted to non-commercial purposes.
diff --git a/examples/say-server/grid-cell.png b/examples/say-server/grid-cell.png
new file mode 100644
index 000000000..ff2e85a95
Binary files /dev/null and b/examples/say-server/grid-cell.png differ
diff --git a/examples/say-server/mcp-app.html b/examples/say-server/mcp-app.html
new file mode 100644
index 000000000..c51777c71
--- /dev/null
+++ b/examples/say-server/mcp-app.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+ Say Widget
+
+
+
+
+
+
+
diff --git a/examples/say-server/package.json b/examples/say-server/package.json
new file mode 100644
index 000000000..a752ef7c4
--- /dev/null
+++ b/examples/say-server/package.json
@@ -0,0 +1,17 @@
+{
+ "name": "@modelcontextprotocol/server-say",
+ "version": "0.4.1",
+ "private": true,
+ "description": "Streaming TTS MCP App Server with karaoke-style text highlighting",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/modelcontextprotocol/ext-apps",
+ "directory": "examples/say-server"
+ },
+ "license": "MIT",
+ "scripts": {
+ "start": "uv run server.py",
+ "dev": "uv run server.py",
+ "build": "echo 'No build step needed for Python server'"
+ }
+}
diff --git a/examples/say-server/screenshot.png b/examples/say-server/screenshot.png
new file mode 100644
index 000000000..0ac8e6cc1
Binary files /dev/null and b/examples/say-server/screenshot.png differ
diff --git a/examples/say-server/server.py b/examples/say-server/server.py
new file mode 100755
index 000000000..7d3891856
--- /dev/null
+++ b/examples/say-server/server.py
@@ -0,0 +1,1457 @@
+#!/usr/bin/env -S uv run --default-index https://pypi.org/simple
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "mcp @ git+https://github.com/modelcontextprotocol/python-sdk@main",
+# "uvicorn>=0.34.0",
+# "starlette>=0.46.0",
+# "pocket-tts>=1.0.1",
+# ]
+# ///
+"""
+Say Demo - MCP App for streaming text-to-speech.
+
+This MCP server provides a "say" tool that speaks text using TTS.
+The widget receives streaming partial input and starts speaking immediately.
+
+Architecture:
+- The `say` tool itself is a no-op - it just triggers the widget
+- The widget uses `ontoolinputpartial` to receive text as it streams
+- Widget calls private tools to create TTS queue, add text, and poll audio
+- Audio plays in the widget using Web Audio API
+- Model context updates show playback progress to the LLM
+- Native theming adapts to dark/light mode automatically
+- Fullscreen mode with Escape key to exit
+- Multi-widget speak lock coordinates playback across instances
+
+Usage:
+ # Start the MCP server
+ python server.py
+
+ # Or with stdio transport (for Claude Desktop)
+ python server.py --stdio
+"""
+from __future__ import annotations
+import asyncio
+import base64
+import logging
+import os
+import sys
+import time
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Annotated, Literal
+from pydantic import Field
+
+import torch
+import uvicorn
+from mcp.server.fastmcp import FastMCP
+from mcp import types
+from mcp.types import Icon
+from starlette.middleware.cors import CORSMiddleware
+
+from pocket_tts.models.tts_model import TTSModel, prepare_text_prompt
+from pocket_tts.default_parameters import DEFAULT_AUDIO_PROMPT
+
+logger = logging.getLogger(__name__)
+
+WIDGET_URI = "ui://say-demo/widget.html"
+HOST = os.environ.get("HOST", "0.0.0.0")
+PORT = int(os.environ.get("PORT", "3109"))
+
+# Speaker icon as SVG data URI
+SPEAKER_ICON = Icon(
+ src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='none' stroke='currentColor' stroke-width='2'%3E%3Cpolygon points='11 5 6 9 2 9 2 15 6 15 11 19 11 5'/%3E%3Cpath d='M15.54 8.46a5 5 0 0 1 0 7.07'/%3E%3Cpath d='M19.07 4.93a10 10 0 0 1 0 14.14'/%3E%3C/svg%3E",
+ mimeType="image/svg+xml",
+)
+
+mcp = FastMCP("Say Demo", icons=[SPEAKER_ICON])
+
+# Global TTS model (loaded on startup)
+tts_model: TTSModel | None = None
+
+
+# ------------------------------------------------------
+# TTS Queue State Management
+# ------------------------------------------------------
+
+@dataclass
+class AudioChunkData:
+ """Audio chunk with timing metadata."""
+ index: int
+ audio_base64: str
+ char_start: int
+ char_end: int
+ duration_ms: float
+
+
+@dataclass
+class TTSQueueState:
+ """State for a TTS generation queue."""
+ id: str
+ voice: str
+ sample_rate: int
+ status: Literal["active", "complete", "error"] = "active"
+ error_message: str | None = None
+
+ # Text queue
+ text_queue: asyncio.Queue = field(default_factory=asyncio.Queue)
+ end_signaled: bool = False
+
+ # Audio output
+ audio_chunks: list[AudioChunkData] = field(default_factory=list)
+ chunks_delivered: int = 0
+
+ # Tracking
+ created_at: float = field(default_factory=time.time)
+ last_activity: float = field(default_factory=time.time) # Last text or end signal
+ lock: asyncio.Lock = field(default_factory=asyncio.Lock)
+ task: asyncio.Task | None = None
+
+
+# Active TTS queues
+tts_queues: dict[str, TTSQueueState] = {}
+
+# Queue timeout: if no activity for this long, mark as error
+QUEUE_TIMEOUT_SECONDS = 30
+
+
+# ------------------------------------------------------
+# Public Tool: say
+# ------------------------------------------------------
+
+DEFAULT_TEXT = """Hello! I'm a text-to-speech demonstration. This speech is being generated in real-time as you watch. The words you see highlighted are synchronized with the audio playback, creating a karaoke-style reading experience. You can click to pause or resume, and use the reset button to restart from the beginning. Pretty neat, right?"""
+
+# Predefined voices from pocket-tts (mapped to HuggingFace files)
+# See: https://huggingface.co/kyutai/tts-voices
+PREDEFINED_VOICES = {
+ "alba": "hf://kyutai/tts-voices/alba-mackenna/casual.wav",
+ "marius": "hf://kyutai/tts-voices/alba-mackenna/merchant.wav",
+ "javert": "hf://kyutai/tts-voices/alba-mackenna/announcer.wav",
+ "jean": "hf://kyutai/tts-voices/alba-mackenna/a-moment-by.wav",
+ "fantine": "hf://kyutai/tts-voices/vctk/p225_023_mic1.wav",
+ "cosette": "hf://kyutai/tts-voices/vctk/p226_023_mic1.wav",
+ "eponine": "hf://kyutai/tts-voices/vctk/p227_023_mic1.wav",
+ "azelma": "hf://kyutai/tts-voices/vctk/p228_023_mic1.wav",
+}
+
+DEFAULT_VOICE = "cosette"
+
+
+@mcp.tool()
+def list_voices() -> list[types.TextContent]:
+ """List available TTS voices.
+
+ Returns the predefined voice names that can be used with the say tool.
+ You can also use HuggingFace URLs (hf://kyutai/tts-voices/...) or local file paths.
+ """
+ import json
+ voice_info = {
+ "predefined_voices": list(PREDEFINED_VOICES.keys()),
+ "default_voice": DEFAULT_VOICE,
+ "custom_voice_formats": [
+ "hf://kyutai/tts-voices//.wav",
+ "/path/to/local/voice.wav",
+ ],
+ "collections": [
+ "alba-mackenna (CC-BY 4.0) - voice-acted characters",
+ "vctk (CC-BY 4.0) - VCTK dataset speakers",
+ "cml-tts/fr (CC-BY 4.0) - French voices",
+ "voice-donations (CC0) - public domain community voices",
+ "expresso (CC-BY-NC 4.0) - expressive (NON-COMMERCIAL ONLY)",
+ "ears (CC-BY-NC 4.0) - emotional (NON-COMMERCIAL ONLY)",
+ ],
+ }
+ return [types.TextContent(type="text", text=json.dumps(voice_info, indent=2))]
+
+
+@mcp.tool(meta={
+ "ui":{"resourceUri": WIDGET_URI},
+ "ui/resourceUri": WIDGET_URI, # legacy support
+})
+def say(
+ text: Annotated[str, Field(description="The English text to speak aloud")] = DEFAULT_TEXT,
+ voice: Annotated[str, Field(
+ description="Voice to use. Can be a predefined name (alba, marius, cosette, etc.), "
+ "a HuggingFace URL (hf://kyutai/tts-voices/...), or a local file path."
+ )] = DEFAULT_VOICE,
+ autoPlay: Annotated[bool, Field(
+ description="Whether to start playing automatically. Note: browsers may block autoplay until user interaction."
+ )] = True,
+) -> list[types.TextContent]:
+ """Speak English text aloud using text-to-speech.
+
+ Use when the user wants text read or spoken aloud:
+ - "say ...", "speak ...", "read ... out loud"
+ - "...; say it", "...; read it to me", "...; speak it"
+ - "narrate ...", "read this aloud"
+
+ Audio streams in real-time as text is provided.
+ Use list_voices() for voice options.
+
+ Note: English only. Non-English text may produce poor or garbled results.
+ """
+ # Generate a unique ID for this widget instance (used for speak lock coordination)
+ widget_uuid = uuid.uuid4().hex[:12]
+
+ # This is a no-op - the widget handles everything via ontoolinputpartial
+ # The tool exists to:
+ # 1. Trigger the widget to load
+ # 2. Provide the resourceUri metadata
+ # 3. Show the final text in the tool result
+ # 4. Provide widget UUID for multi-player coordination
+ return [types.TextContent(
+ type="text",
+ text=f"Displayed a TTS widget with voice '{voice}'. Click to play/pause, use toolbar to restart or fullscreen.",
+ _meta={"widgetUUID": widget_uuid},
+ )]
+
+
+# ------------------------------------------------------
+# Private Tools: TTS Queue Management
+# ------------------------------------------------------
+
+@mcp.tool(meta={"ui":{"visibility":["app"]}})
+async def create_tts_queue(voice: str = "cosette") -> list[types.TextContent]:
+ """Create a TTS generation queue. Returns queue_id and sample_rate.
+
+ Args:
+ voice: Voice to use (cosette, alba, marius, etc.)
+ """
+ if tts_model is None:
+ return [types.TextContent(type="text", text='{"error": "TTS model not loaded"}')]
+
+ queue_id = uuid.uuid4().hex[:12]
+ sample_rate = tts_model.config.mimi.sample_rate
+
+ state = TTSQueueState(
+ id=queue_id,
+ voice=voice,
+ sample_rate=sample_rate,
+ )
+ tts_queues[queue_id] = state
+
+ # Start background TTS processing task
+ state.task = asyncio.create_task(_run_tts_queue(state))
+
+ logger.info(f"Created TTS queue {queue_id}")
+
+ import json
+ return [types.TextContent(
+ type="text",
+ text=json.dumps({"queue_id": queue_id, "sample_rate": sample_rate})
+ )]
+
+
+@mcp.tool(meta={"ui":{"visibility":["app"]}})
+def add_tts_text(queue_id: str, text: str) -> list[types.TextContent]:
+ """Add text to a TTS queue.
+
+ Args:
+ queue_id: The queue ID from create_tts_queue
+ text: Text to add (incremental, not cumulative)
+ """
+ state = tts_queues.get(queue_id)
+ if not state:
+ return [types.TextContent(type="text", text='{"error": "Queue not found"}')]
+ if state.end_signaled:
+ return [types.TextContent(type="text", text='{"error": "Queue already ended"}')]
+
+ # Queue the text (non-blocking)
+ try:
+ state.text_queue.put_nowait(text)
+ state.last_activity = time.time() # Update activity timestamp
+ except asyncio.QueueFull:
+ return [types.TextContent(type="text", text='{"error": "Queue full"}')]
+
+ return [types.TextContent(type="text", text='{"queued": true}')]
+
+
+@mcp.tool(meta={"ui":{"visibility":["app"]}})
+def end_tts_queue(queue_id: str) -> list[types.TextContent]:
+ """Signal that no more text will be sent to a queue.
+
+ Args:
+ queue_id: The queue ID from create_tts_queue
+ """
+ state = tts_queues.get(queue_id)
+ if not state:
+ logger.warning(f"end_tts_queue called for unknown queue: {queue_id}")
+ return [types.TextContent(type="text", text='{"error": "Queue not found"}')]
+ if state.end_signaled:
+ logger.info(f"end_tts_queue called for already-ended queue: {queue_id}")
+ return [types.TextContent(type="text", text='{"already_ended": true}')]
+
+ state.end_signaled = True
+ state.last_activity = time.time() # Update activity timestamp
+ try:
+ state.text_queue.put_nowait(None) # EOF marker
+ except asyncio.QueueFull:
+ pass
+
+ logger.info(f"end_tts_queue called for queue: {queue_id}")
+ return [types.TextContent(type="text", text='{"ended": true}')]
+
+
+@mcp.tool(meta={"ui":{"visibility":["app"]}})
+def cancel_tts_queue(queue_id: str) -> list[types.TextContent]:
+ """Cancel and cleanup a TTS queue. Use before creating a new queue to avoid overlapping playback.
+
+ Args:
+ queue_id: The queue ID from create_tts_queue
+ """
+ state = tts_queues.pop(queue_id, None)
+ if not state:
+ return [types.TextContent(type="text", text='{"error": "Queue not found"}')]
+
+ # Cancel the background task
+ if state.task and not state.task.done():
+ state.task.cancel()
+ logger.info(f"Cancelled TTS queue {queue_id}")
+
+ # Signal end to unblock any waiting consumers
+ state.end_signaled = True
+ try:
+ state.text_queue.put_nowait(None)
+ except asyncio.QueueFull:
+ pass
+
+ state.status = "complete"
+
+ return [types.TextContent(type="text", text='{"cancelled": true}')]
+
+
+@mcp.tool(meta={"ui":{"visibility":["app"]}})
+def poll_tts_audio(queue_id: str) -> list[types.TextContent]:
+ """Poll for available audio chunks from a TTS queue.
+
+ Returns base64-encoded audio chunks with timing metadata.
+ Call repeatedly until done=true.
+
+ Args:
+ queue_id: The queue ID from create_tts_queue
+ """
+ import json
+ import time
+
+ state = tts_queues.get(queue_id)
+ if not state:
+ return [types.TextContent(type="text", text='{"error": "Queue not found"}')]
+
+ # Update last activity to prevent timeout during active polling
+ state.last_activity = time.time()
+
+ # Get new chunks (use sync approach since we can't await in tool)
+ # The lock is async, so we need to be careful here
+ # For simplicity, just grab what's available without locking
+ new_chunks = state.audio_chunks[state.chunks_delivered:]
+ state.chunks_delivered = len(state.audio_chunks)
+
+ # Consider queues with errors as "done" so widget stops polling
+ done = (state.status == "complete" or state.status == "error") and state.chunks_delivered >= len(state.audio_chunks)
+
+ response = {
+ "chunks": [
+ {
+ "index": c.index,
+ "audio_base64": c.audio_base64,
+ "char_start": c.char_start,
+ "char_end": c.char_end,
+ "duration_ms": c.duration_ms,
+ }
+ for c in new_chunks
+ ],
+ "done": done,
+ "status": state.status,
+ }
+
+ # Include error message if present
+ if state.error_message:
+ response["error"] = state.error_message
+
+ # Clean up completed or errored queues
+ if done:
+ # Schedule cleanup after a delay
+ async def cleanup():
+ await asyncio.sleep(60)
+ tts_queues.pop(queue_id, None)
+ try:
+ asyncio.get_event_loop().create_task(cleanup())
+ except RuntimeError:
+ pass
+
+ return [types.TextContent(type="text", text=json.dumps(response))]
+
+
+# ------------------------------------------------------
+# Background TTS Processing
+# ------------------------------------------------------
+
+
+class StreamingTextChunker:
+ """Buffers streaming text and emits chunks when ready for TTS processing.
+
+ Chunks are emitted when:
+ - Token count reaches max_tokens threshold (at a sentence boundary if possible)
+ - flush() is called (end of stream)
+
+ This matches the chunking behavior of split_into_best_sentences() but works
+ incrementally as text arrives.
+ """
+
+ def __init__(self, tokenizer, max_tokens: int = 50, min_tokens: int = 15):
+ """
+ Args:
+ tokenizer: SentencePiece tokenizer from flow_lm.conditioner.tokenizer
+ max_tokens: Maximum tokens per chunk (default 50, matches existing)
+ min_tokens: Minimum tokens before considering emission
+ """
+ self.tokenizer = tokenizer
+ self.max_tokens = max_tokens
+ self.min_tokens = min_tokens
+ self.buffer = ""
+
+ # Cache end-of-sentence token IDs for boundary detection
+ _, *eos_tokens = tokenizer(".!...?").tokens[0].tolist()
+ self.eos_tokens = set(eos_tokens)
+
+ def add_text(self, text: str) -> list[str]:
+ """Add text to buffer, return any complete chunks ready for processing.
+
+ Args:
+ text: Incremental text to add (e.g., from LLM token)
+
+ Returns:
+ List of text chunks ready for TTS (may be empty if still buffering)
+ """
+ self.buffer += text
+ return self._extract_ready_chunks()
+
+ def flush(self) -> list[str]:
+ """Flush remaining buffer as final chunk(s).
+
+ Call this when the text stream ends to process any remaining text.
+
+ Returns:
+ List of final text chunks (may be empty if buffer was empty)
+ """
+ if not self.buffer.strip():
+ return []
+
+ # Force emit whatever remains
+ chunks = self._extract_ready_chunks(force_emit=True)
+ if self.buffer.strip():
+ chunks.append(self.buffer.strip())
+ self.buffer = ""
+ return chunks
+
+ def _extract_ready_chunks(self, force_emit: bool = False) -> list[str]:
+ """Extract chunks that are ready for processing."""
+ chunks = []
+
+ while True:
+ chunk = self._try_extract_chunk(force_emit and not chunks)
+ if chunk is None:
+ break
+ chunks.append(chunk)
+
+ return chunks
+
+ def _try_extract_chunk(self, force_emit: bool = False) -> str | None:
+ """Try to extract one chunk from buffer."""
+ text = self.buffer.strip()
+ if not text:
+ return None
+
+ tokens = self.tokenizer(text).tokens[0].tolist()
+ num_tokens = len(tokens)
+
+ # Not enough tokens yet
+ if num_tokens < self.min_tokens and not force_emit:
+ return None
+
+ # Under max and not forcing - check for complete sentence worth emitting
+ if num_tokens < self.max_tokens and not force_emit:
+ # Only emit early if we have a complete sentence at a good length
+ if num_tokens >= self.min_tokens and self._ends_with_sentence_boundary(tokens):
+ # Found a complete sentence - emit it
+ chunk = text
+ self.buffer = ""
+ return chunk
+ return None
+
+ # Over max_tokens or force_emit - find best split point
+ split_idx = self._find_best_split(tokens, force_emit)
+
+ if split_idx == 0:
+ if force_emit:
+ chunk = text
+ self.buffer = ""
+ return chunk
+ return None
+
+ # Decode tokens up to split point
+ chunk_text = self.tokenizer.sp.decode(tokens[:split_idx])
+ remaining_text = self.tokenizer.sp.decode(tokens[split_idx:])
+
+ self.buffer = remaining_text
+ return chunk_text.strip()
+
+ def _find_best_split(self, tokens: list[int], force_emit: bool = False) -> int:
+ """Find the best token index to split at (sentence boundary near max_tokens)."""
+ # Find all sentence boundaries (position AFTER the punctuation)
+ boundaries = []
+ prev_was_eos = False
+
+ for i, token in enumerate(tokens):
+ if token in self.eos_tokens:
+ prev_was_eos = True
+ elif prev_was_eos:
+ boundaries.append(i)
+ prev_was_eos = False
+
+ # Also consider end of tokens if it ends with punctuation
+ if tokens and tokens[-1] in self.eos_tokens:
+ boundaries.append(len(tokens))
+
+ if not boundaries:
+ # No sentence boundaries - split at max_tokens if we're over
+ if len(tokens) >= self.max_tokens:
+ return self.max_tokens
+ return len(tokens) if force_emit else 0
+
+ # Find boundary closest to max_tokens without going too far over
+ best_boundary = 0
+ for boundary in boundaries:
+ if boundary <= self.max_tokens:
+ best_boundary = boundary
+ elif best_boundary == 0:
+ # First boundary is past max - use it anyway
+ best_boundary = boundary
+ break
+ else:
+ # We have a good boundary before max, stop
+ break
+
+ return best_boundary
+
+ def _ends_with_sentence_boundary(self, tokens: list[int]) -> bool:
+ """Check if token sequence ends with sentence-ending punctuation."""
+ if not tokens:
+ return False
+ return tokens[-1] in self.eos_tokens
+
+ @property
+ def buffered_text(self) -> str:
+ """Current buffered text (for debugging/monitoring)."""
+ return self.buffer
+
+ @property
+ def buffered_token_count(self) -> int:
+ """Approximate token count in buffer."""
+ if not self.buffer.strip():
+ return 0
+ return len(self.tokenizer(self.buffer).tokens[0].tolist())
+
+
+async def _run_tts_queue(state: TTSQueueState):
+ """Background task: consume text queue, produce audio chunks."""
+ if tts_model is None:
+ state.status = "error"
+ state.error_message = "TTS model not loaded"
+ return
+
+ model_state = tts_model._cached_get_state_for_audio_prompt(state.voice, truncate=True)
+ chunker = StreamingTextChunker(tts_model.flow_lm.conditioner.tokenizer)
+ chunk_index = 0
+ char_offset = 0
+
+ try:
+ while True:
+ # Wait for text with timeout to detect stale queues
+ try:
+ text_item = await asyncio.wait_for(
+ state.text_queue.get(),
+ timeout=5.0 # Check every 5 seconds
+ )
+ except asyncio.TimeoutError:
+ # Check if queue is stale (no activity for too long)
+ if time.time() - state.last_activity > QUEUE_TIMEOUT_SECONDS:
+ logger.warning(f"TTS queue {state.id} timeout after {QUEUE_TIMEOUT_SECONDS}s of inactivity")
+ state.status = "error"
+ state.error_message = f"Queue timeout: no activity for {QUEUE_TIMEOUT_SECONDS}s"
+ break
+ # Continue waiting - queue might still be active
+ continue
+
+ if text_item is None:
+ # EOF - flush remaining text
+ remaining = chunker.flush()
+ for chunk_text in remaining:
+ await _process_tts_chunk(state, chunk_text, chunk_index, char_offset, model_state)
+ char_offset += len(chunk_text)
+ chunk_index += 1
+
+ state.status = "complete"
+ logger.info(f"TTS queue {state.id} complete: {chunk_index} chunks")
+ break
+
+ # Feed text to chunker
+ ready_chunks = chunker.add_text(text_item)
+
+ for chunk_text in ready_chunks:
+ await _process_tts_chunk(state, chunk_text, chunk_index, char_offset, model_state)
+ char_offset += len(chunk_text)
+ chunk_index += 1
+
+ except Exception as e:
+ logger.error(f"TTS queue {state.id} error: {e}")
+ state.status = "error"
+ state.error_message = str(e)
+
+
+async def _process_tts_chunk(
+ state: TTSQueueState,
+ text: str,
+ chunk_index: int,
+ char_offset: int,
+ model_state: dict,
+):
+ """Process a text chunk and add audio to state."""
+ if tts_model is None:
+ return
+
+ loop = asyncio.get_event_loop()
+ audio_bytes_list: list[bytes] = []
+ total_samples = 0
+
+ def generate_sync():
+ nonlocal total_samples
+ _, frames_after_eos = prepare_text_prompt(text)
+ frames_after_eos += 2
+
+ for audio_chunk in tts_model._generate_audio_stream_short_text(
+ model_state=model_state,
+ text_to_generate=text,
+ frames_after_eos=frames_after_eos,
+ copy_state=True,
+ ):
+ audio_int16 = (audio_chunk * 32767).to(torch.int16)
+ audio_bytes_list.append(audio_int16.cpu().numpy().tobytes())
+ total_samples += len(audio_chunk)
+
+ await loop.run_in_executor(None, generate_sync)
+
+ combined_audio = b"".join(audio_bytes_list)
+ duration_ms = (total_samples / state.sample_rate) * 1000
+
+ chunk_data = AudioChunkData(
+ index=chunk_index,
+ audio_base64=base64.b64encode(combined_audio).decode(),
+ char_start=char_offset,
+ char_end=char_offset + len(text),
+ duration_ms=duration_ms,
+ )
+
+ async with state.lock:
+ state.audio_chunks.append(chunk_data)
+
+ logger.debug(f"TTS queue {state.id}: chunk {chunk_index} ready ({duration_ms:.0f}ms)")
+
+
+# ------------------------------------------------------
+# Widget Resource
+# ------------------------------------------------------
+
+# Embedded widget HTML for standalone execution via `uv run <url>`
+# Uses Babel standalone for in-browser JSX transpilation
+# This is a copy of widget.html - keep them in sync!
+EMBEDDED_WIDGET_HTML = """
+
+
+
+
+ Say Widget
+
+
+
+
+
+
+
+
+"""
+
+
+def get_widget_html() -> str:
+ """Get the widget HTML, preferring built version from dist/."""
+ # Prefer built version from dist/ (local development with npm run build)
+ dist_path = Path(__file__).parent / "dist" / "mcp-app.html"
+ if dist_path.exists():
+ return dist_path.read_text()
+ # Fallback to embedded widget (for `uv run <url>` or unbundled usage)
+ return EMBEDDED_WIDGET_HTML
+
+
+# IMPORTANT: all the external domains used by app must be listed
+# in the meta.ui.csp.resourceDomains - otherwise they will be blocked by CSP policy
+@mcp.resource(
+ WIDGET_URI,
+ mime_type="text/html;profile=mcp-app",
+ meta={"ui": {"csp": {"resourceDomains": ["https://esm.sh", "https://unpkg.com"]}}},
+)
+def widget() -> str:
+ """Widget HTML resource with CSP metadata for external dependencies."""
+ return get_widget_html()
+
+
+# ------------------------------------------------------
+# Startup
+# ------------------------------------------------------
+
+def load_tts_model():
+ """Load the TTS model on startup."""
+ global tts_model
+ logger.info("Loading TTS model...")
+ tts_model = TTSModel.load_model()
+ logger.info("TTS model loaded")
+
+
+def create_app():
+ """Create the ASGI app (for uvicorn reload mode)."""
+ load_tts_model()
+ app = mcp.streamable_http_app(stateless_http=True)
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+ return app
+
+
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.INFO)
+
+ if "--stdio" in sys.argv:
+ # Claude Desktop mode
+ load_tts_model()
+ mcp.run(transport="stdio")
+ elif "--reload" in sys.argv:
+ # Reload mode - pass app as string so uvicorn can reimport
+ print(f"Say Server listening on http://{HOST}:{PORT}/mcp (reload mode)")
+ uvicorn.run("server:create_app", host=HOST, port=PORT, reload=True, factory=True)
+ else:
+ # HTTP mode
+ app = create_app()
+ print(f"Say Server listening on http://{HOST}:{PORT}/mcp")
+ uvicorn.run(app, host=HOST, port=PORT)
diff --git a/package-lock.json b/package-lock.json
index 405f3148d..b82b93a41 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -617,6 +617,44 @@
"name": "@modelcontextprotocol/server-qr",
"version": "1.0.0"
},
+ "examples/say-server": {
+ "name": "@modelcontextprotocol/server-say",
+ "version": "0.4.1",
+ "license": "MIT",
+ "dependencies": {
+ "@modelcontextprotocol/ext-apps": "^0.4.1",
+ "react": "^19.2.0",
+ "react-dom": "^19.2.0"
+ },
+ "devDependencies": {
+ "@types/node": "^22.0.0",
+ "@types/react": "^19.2.2",
+ "@types/react-dom": "^19.2.2",
+ "@vitejs/plugin-react": "^4.3.4",
+ "concurrently": "^9.2.1",
+ "cross-env": "^10.1.0",
+ "typescript": "^5.9.3",
+ "vite": "^6.0.0",
+ "vite-plugin-singlefile": "^2.3.0"
+ }
+ },
+ "examples/say-server/node_modules/@types/node": {
+ "version": "22.19.7",
+ "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.7.tgz",
+ "integrity": "sha512-MciR4AKGHWl7xwxkBa6xUGxQJ4VBOmPTF7sL+iGzuahOFaO0jHCsuEfS80pan1ef4gWId1oWOweIhrDEYLuaOw==",
+ "dev": true,
+ "license": "MIT",
+ "dependencies": {
+ "undici-types": "~6.21.0"
+ }
+ },
+ "examples/say-server/node_modules/undici-types": {
+ "version": "6.21.0",
+ "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
+ "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
+ "dev": true,
+ "license": "MIT"
+ },
"examples/scenario-modeler-server": {
"name": "@modelcontextprotocol/server-scenario-modeler",
"version": "0.4.1",
@@ -2480,6 +2518,10 @@
"resolved": "examples/qr-server",
"link": true
},
+ "node_modules/@modelcontextprotocol/server-say": {
+ "resolved": "examples/say-server",
+ "link": true
+ },
"node_modules/@modelcontextprotocol/server-scenario-modeler": {
"resolved": "examples/scenario-modeler-server",
"link": true
diff --git a/tests/e2e/generate-grid-screenshots.spec.ts b/tests/e2e/generate-grid-screenshots.spec.ts
index 80e3c0a15..1bc639c8e 100644
--- a/tests/e2e/generate-grid-screenshots.spec.ts
+++ b/tests/e2e/generate-grid-screenshots.spec.ts
@@ -58,6 +58,7 @@ const ALL_SERVERS = [
{ key: "map-server", name: "Map Server", dir: "map-server" },
{ key: "pdf-server", name: "PDF Server", dir: "pdf-server" },
{ key: "qr-server", name: "QR Code Server", dir: "qr-server" },
+ { key: "say-server", name: "Say Demo", dir: "say-server" },
{
key: "scenario-modeler",
name: "SaaS Scenario Modeler",
diff --git a/tests/e2e/servers.spec.ts b/tests/e2e/servers.spec.ts
index a3df89fb7..d3928931a 100644
--- a/tests/e2e/servers.spec.ts
+++ b/tests/e2e/servers.spec.ts
@@ -16,6 +16,7 @@ const DYNAMIC_MASKS: Record = {
"basic-vue": ["#server-time"], // Server time display
"cohort-heatmap": ['[class*="heatmapWrapper"]'], // Heatmap grid (random data)
"customer-segmentation": [".chart-container"], // Scatter plot (random data)
+ "say-server": [".playBtn", ".playOverlayBtn"], // Play buttons may have different states
shadertoy: ["#canvas"], // WebGL shader canvas (animated)
"system-monitor": [
".chart-container", // CPU chart (highly dynamic)
diff --git a/tests/e2e/servers.spec.ts-snapshots/say-server.png b/tests/e2e/servers.spec.ts-snapshots/say-server.png
new file mode 100644
index 000000000..6bd22151e
Binary files /dev/null and b/tests/e2e/servers.spec.ts-snapshots/say-server.png differ