diff --git a/.gitignore b/.gitignore index 1d78df597..9c6a1374a 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ intermediate-findings/ # Playwright playwright-report/ test-results/ +__pycache__/ +*.pyc \ No newline at end of file diff --git a/README.md b/README.md index 6057fe10c..ece4234ac 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,8 @@ Or edit your `package.json` manually: | [**Scenario Modeler**](examples/scenario-modeler-server) | [**Budget Allocator**](examples/budget-allocator-server) | [**Customer Segmentation**](examples/customer-segmentation-server) | | [![System Monitor](examples/system-monitor-server/grid-cell.png "Real-time OS metrics")](examples/system-monitor-server) | [![Transcript](examples/transcript-server/grid-cell.png "Live speech transcription")](examples/transcript-server) | [![Video Resource](examples/video-resource-server/grid-cell.png "Binary video via MCP resources")](examples/video-resource-server) | | [**System Monitor**](examples/system-monitor-server) | [**Transcript**](examples/transcript-server) | [**Video Resource**](examples/video-resource-server) | -| [![PDF Server](examples/pdf-server/grid-cell.png "Interactive PDF viewer with chunked loading")](examples/pdf-server) | [![QR Code](examples/qr-server/grid-cell.png "QR code generator")](examples/qr-server) | | -| [**PDF Server**](examples/pdf-server) | [**QR Code (Python)**](examples/qr-server) | | +| [![PDF Server](examples/pdf-server/grid-cell.png "Interactive PDF viewer with chunked loading")](examples/pdf-server) | [![QR Code](examples/qr-server/grid-cell.png "QR code generator")](examples/qr-server) | [![Say Demo](examples/say-server/grid-cell.png "Text-to-speech demo")](examples/say-server) | +| [**PDF Server**](examples/pdf-server) | [**QR Code (Python)**](examples/qr-server) | [**Say Demo**](examples/say-server) | ### Starter Templates diff --git a/examples/say-server/.gitignore b/examples/say-server/.gitignore new file mode 100644 index 000000000..b94707787 --- /dev/null +++ b/examples/say-server/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +dist/ diff --git a/examples/say-server/README.md b/examples/say-server/README.md new file mode 100644 index 000000000..b6629ef82 --- /dev/null +++ b/examples/say-server/README.md @@ -0,0 +1,175 @@ +# Say Server - Streaming TTS MCP App + +A real-time text-to-speech MCP App with karaoke-style text highlighting, powered by [Kyutai's Pocket TTS](https://github.com/kyutai-labs/pocket-tts). 
+ +## MCP App Features Demonstrated + +This example showcases several MCP App capabilities: + +- **Single-file executable**: Python server with embedded React UI - no build step required +- **Partial tool inputs** (`ontoolinputpartial`): Widget receives streaming text as it's being generated +- **Queue-based streaming**: Demonstrates how to stream text out and audio in via a polling tool (adds text to an input queue, retrieves audio chunks from an output queue) +- **Model context updates**: Widget updates the LLM with playback progress ("Playing: ...snippet...") +- **Native theming**: Uses CSS variables for automatic dark/light mode adaptation +- **Fullscreen mode**: Toggle fullscreen via `requestDisplayMode()` API, press Escape to exit +- **Multi-widget speak lock**: Coordinates multiple TTS widgets via localStorage so only one plays at a time +- **Hidden tools** (`visibility: ["app"]`): Private tools only accessible to the widget, not the model +- **External links** (`openLink`): Attribution popup uses `app.openLink()` to open external URLs +- **CSP metadata**: Resource declares required domains (`esm.sh`) for in-browser transpilation + +## Features + +- **Streaming TTS**: Audio starts playing as text is being generated +- **Karaoke highlighting**: Words are highlighted in sync with speech +- **Interactive controls**: Click to pause/resume, double-click to restart +- **Low latency**: Uses a polling-based queue for minimal delay + +## Prerequisites + +- [uv](https://docs.astral.sh/uv/getting-started/installation/) - fast Python package manager +- A CUDA GPU (recommended) or CPU with sufficient RAM (~2GB for model) + +## Quick Start + +The server is a single self-contained Python file that can be run directly with `uv`: + +```bash +# Run directly (uv auto-installs dependencies) +uv run examples/say-server/server.py +``` + +The server will be available at `http://localhost:3109/mcp`. + +## Running with Docker + +Run directly from GitHub using the official `uv` Docker image. Mount your HuggingFace cache to avoid re-downloading the model: + +```bash +docker run --rm -it \ + -p 3109:3109 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e HF_HOME=/root/.cache/huggingface \ + ghcr.io/astral-sh/uv:debian \ + uv run https://raw.githubusercontent.com/modelcontextprotocol/ext-apps/main/examples/say-server/server.py +``` + +For GPU support, add `--gpus all` (requires [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)). + +## Usage + +### With Claude Desktop + +Add to your Claude Desktop config (`~/Library/Application Support/Claude/claude_desktop_config.json`): + +```json +{ + "mcpServers": { + "say": { + "command": "uv", + "args": ["run", "server.py", "--stdio"], + "cwd": "/path/to/examples/say-server" + } + } +} +``` + +### With MCP Clients + +Connect to `http://localhost:3109/mcp` and call the `say` tool: + +```json +{ + "name": "say", + "arguments": { + "text": "Hello, world! This is a streaming TTS demo." + } +} +``` + +## Available Voices + +The default voice is `cosette`. 
Use the `list_voices` tool or pass a `voice` parameter to `say`: + +### Predefined Voices + +- `alba`, `marius`, `javert`, `jean` - from [alba-mackenna](https://huggingface.co/kyutai/tts-voices/tree/main/alba-mackenna) (CC BY 4.0) +- `cosette`, `eponine`, `azelma`, `fantine` - from [VCTK dataset](https://huggingface.co/kyutai/tts-voices/tree/main/vctk) (CC BY 4.0) + +### Custom Voices + +You can also use HuggingFace URLs or local file paths: + +```json +{"text": "Hello!", "voice": "hf://kyutai/tts-voices/voice-donations/alice.wav"} +{"text": "Hello!", "voice": "/path/to/my-voice.wav"} +``` + +See the [kyutai/tts-voices](https://huggingface.co/kyutai/tts-voices) repository for more voice collections + +## Architecture + +The entire server is contained in a single `server.py` file: + +1. **`say` tool**: Public tool that triggers the widget with text to speak +2. **Private tools** (`create_tts_queue`, `add_tts_text`, `poll_tts_audio`, etc.): Hidden from the model, only callable by the widget +3. **Embedded React widget**: Uses [Babel standalone](https://babeljs.io/docs/babel-standalone) for in-browser JSX transpilation - no build step needed +4. **TTS backend**: Manages per-request audio queues using Pocket TTS + +The widget communicates with the server via MCP tool calls: + +- Receives streaming text via `ontoolinputpartial` callback +- Incrementally sends new text to the server as it arrives (via `add_tts_text`) +- Polls for generated audio chunks while TTS runs in parallel +- Plays audio via Web Audio API with synchronized text highlighting + +## Multi-Widget Speak Lock + +When multiple TTS widgets exist in the same browser (e.g., multiple chat messages each with their own say widget), they coordinate via localStorage to ensure only one plays at a time: + +1. **Unique Widget IDs**: Each widget receives a UUID via `toolResult._meta.widgetUUID` +2. **Announce on Play**: When starting, a widget writes `{uuid, timestamp}` to `localStorage["mcp-tts-playing"]` +3. **Poll for Conflicts**: Every 200ms, playing widgets check if another widget took the lock +4. **Yield Gracefully**: If another widget started playing, pause and yield +5. **Clean Up**: On pause/finish, clear the lock (only if owned) + +This "last writer wins" protocol ensures a seamless experience: clicking play on any widget immediately pauses others, without requiring cross-iframe postMessage coordination. + +## TODO + +- Persist caret position in localStorage (resume from where you left off) +- Click anywhere in text to move the cursor/playback position + +## Credits + +This project uses [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) by [Kyutai](https://kyutai.org/) - a fantastic open-source text-to-speech model. Thank you to the Kyutai team for making this technology available! + +The server includes modified Pocket TTS code to support streaming text input (text can be fed incrementally while audio generation runs in parallel). A PR contributing this functionality back to the original repo is planned. + +## License + +This example is MIT licensed. 
+ +### Third-Party Licenses + +This project uses the following open-source components: + +| Component | License | Link | +| --------------------------------------------------------------------- | ----------------- | ---------------------------- | +| [pocket-tts](https://github.com/kyutai-labs/pocket-tts) | MIT | Python TTS library | +| [Kyutai TTS model](https://huggingface.co/kyutai/tts-0.75b-en-public) | CC-BY 4.0 | Text-to-speech model weights | +| [kyutai/tts-voices](https://huggingface.co/kyutai/tts-voices) | Mixed (see below) | Voice prompt files | + +### Voice Collection Licenses + +The predefined voices in this example use **CC-BY 4.0** licensed collections: + +| Collection | License | Commercial Use | +| --------------- | ------------------- | ------------------------- | +| alba-mackenna | CC-BY 4.0 | ✅ Yes (with attribution) | +| vctk | CC-BY 4.0 | ✅ Yes (with attribution) | +| cml-tts/fr | CC-BY 4.0 | ✅ Yes (with attribution) | +| voice-donations | CC0 (Public Domain) | ✅ Yes | +| **expresso** | CC-BY-NC 4.0 | ❌ Non-commercial only | +| **ears** | CC-BY-NC 4.0 | ❌ Non-commercial only | + +⚠️ **Note**: If you use voices from the `expresso/` or `ears/` collections, your use is restricted to non-commercial purposes. diff --git a/examples/say-server/grid-cell.png b/examples/say-server/grid-cell.png new file mode 100644 index 000000000..ff2e85a95 Binary files /dev/null and b/examples/say-server/grid-cell.png differ diff --git a/examples/say-server/mcp-app.html b/examples/say-server/mcp-app.html new file mode 100644 index 000000000..c51777c71 --- /dev/null +++ b/examples/say-server/mcp-app.html @@ -0,0 +1,14 @@ + + + + + + + Say Widget + + + +
+ + + diff --git a/examples/say-server/package.json b/examples/say-server/package.json new file mode 100644 index 000000000..a752ef7c4 --- /dev/null +++ b/examples/say-server/package.json @@ -0,0 +1,17 @@ +{ + "name": "@modelcontextprotocol/server-say", + "version": "0.4.1", + "private": true, + "description": "Streaming TTS MCP App Server with karaoke-style text highlighting", + "repository": { + "type": "git", + "url": "https://github.com/modelcontextprotocol/ext-apps", + "directory": "examples/say-server" + }, + "license": "MIT", + "scripts": { + "start": "uv run server.py", + "dev": "uv run server.py", + "build": "echo 'No build step needed for Python server'" + } +} diff --git a/examples/say-server/screenshot.png b/examples/say-server/screenshot.png new file mode 100644 index 000000000..0ac8e6cc1 Binary files /dev/null and b/examples/say-server/screenshot.png differ diff --git a/examples/say-server/server.py b/examples/say-server/server.py new file mode 100755 index 000000000..7d3891856 --- /dev/null +++ b/examples/say-server/server.py @@ -0,0 +1,1457 @@ +#!/usr/bin/env uv run --default-index https://pypi.org/simple +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "mcp @ git+https://github.com/modelcontextprotocol/python-sdk@main", +# "uvicorn>=0.34.0", +# "starlette>=0.46.0", +# "pocket-tts>=1.0.1", +# ] +# /// +""" +Say Demo - MCP App for streaming text-to-speech. + +This MCP server provides a "say" tool that speaks text using TTS. +The widget receives streaming partial input and starts speaking immediately. + +Architecture: +- The `say` tool itself is a no-op - it just triggers the widget +- The widget uses `ontoolinputpartial` to receive text as it streams +- Widget calls private tools to create TTS queue, add text, and poll audio +- Audio plays in the widget using Web Audio API +- Model context updates show playback progress to the LLM +- Native theming adapts to dark/light mode automatically +- Fullscreen mode with Escape key to exit +- Multi-widget speak lock coordinates playback across instances + +Usage: + # Start the MCP server + python server.py + + # Or with stdio transport (for Claude Desktop) + python server.py --stdio +""" +from __future__ import annotations +import asyncio +import base64 +import logging +import os +import sys +import time +import uuid +from dataclasses import dataclass, field +from pathlib import Path +from typing import Annotated, Literal +from pydantic import Field + +import torch +import uvicorn +from mcp.server.fastmcp import FastMCP +from mcp import types +from mcp.types import Icon +from starlette.middleware.cors import CORSMiddleware + +from pocket_tts.models.tts_model import TTSModel, prepare_text_prompt +from pocket_tts.default_parameters import DEFAULT_AUDIO_PROMPT + +logger = logging.getLogger(__name__) + +WIDGET_URI = "ui://say-demo/widget.html" +HOST = os.environ.get("HOST", "0.0.0.0") +PORT = int(os.environ.get("PORT", "3109")) + +# Speaker icon as SVG data URI +SPEAKER_ICON = Icon( + src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='none' stroke='currentColor' stroke-width='2'%3E%3Cpolygon points='11 5 6 9 2 9 2 15 6 15 11 19 11 5'/%3E%3Cpath d='M15.54 8.46a5 5 0 0 1 0 7.07'/%3E%3Cpath d='M19.07 4.93a10 10 0 0 1 0 14.14'/%3E%3C/svg%3E", + mimeType="image/svg+xml", +) + +mcp = FastMCP("Say Demo", icons=[SPEAKER_ICON]) + +# Global TTS model (loaded on startup) +tts_model: TTSModel | None = None + + +# ------------------------------------------------------ +# TTS Queue State 
Management +# ------------------------------------------------------ + +@dataclass +class AudioChunkData: + """Audio chunk with timing metadata.""" + index: int + audio_base64: str + char_start: int + char_end: int + duration_ms: float + + +@dataclass +class TTSQueueState: + """State for a TTS generation queue.""" + id: str + voice: str + sample_rate: int + status: Literal["active", "complete", "error"] = "active" + error_message: str | None = None + + # Text queue + text_queue: asyncio.Queue = field(default_factory=asyncio.Queue) + end_signaled: bool = False + + # Audio output + audio_chunks: list[AudioChunkData] = field(default_factory=list) + chunks_delivered: int = 0 + + # Tracking + created_at: float = field(default_factory=time.time) + last_activity: float = field(default_factory=time.time) # Last text or end signal + lock: asyncio.Lock = field(default_factory=asyncio.Lock) + task: asyncio.Task | None = None + + +# Active TTS queues +tts_queues: dict[str, TTSQueueState] = {} + +# Queue timeout: if no activity for this long, mark as error +QUEUE_TIMEOUT_SECONDS = 30 + + +# ------------------------------------------------------ +# Public Tool: say +# ------------------------------------------------------ + +DEFAULT_TEXT = """Hello! I'm a text-to-speech demonstration. This speech is being generated in real-time as you watch. The words you see highlighted are synchronized with the audio playback, creating a karaoke-style reading experience. You can click to pause or resume, and use the reset button to restart from the beginning. Pretty neat, right?""" + +# Predefined voices from pocket-tts (mapped to HuggingFace files) +# See: https://huggingface.co/kyutai/tts-voices +PREDEFINED_VOICES = { + "alba": "hf://kyutai/tts-voices/alba-mackenna/casual.wav", + "marius": "hf://kyutai/tts-voices/alba-mackenna/merchant.wav", + "javert": "hf://kyutai/tts-voices/alba-mackenna/announcer.wav", + "jean": "hf://kyutai/tts-voices/alba-mackenna/a-moment-by.wav", + "fantine": "hf://kyutai/tts-voices/vctk/p225_023_mic1.wav", + "cosette": "hf://kyutai/tts-voices/vctk/p226_023_mic1.wav", + "eponine": "hf://kyutai/tts-voices/vctk/p227_023_mic1.wav", + "azelma": "hf://kyutai/tts-voices/vctk/p228_023_mic1.wav", +} + +DEFAULT_VOICE = "cosette" + + +@mcp.tool() +def list_voices() -> list[types.TextContent]: + """List available TTS voices. + + Returns the predefined voice names that can be used with the say tool. + You can also use HuggingFace URLs (hf://kyutai/tts-voices/...) or local file paths. + """ + import json + voice_info = { + "predefined_voices": list(PREDEFINED_VOICES.keys()), + "default_voice": DEFAULT_VOICE, + "custom_voice_formats": [ + "hf://kyutai/tts-voices//.wav", + "/path/to/local/voice.wav", + ], + "collections": [ + "alba-mackenna (CC-BY 4.0) - voice-acted characters", + "vctk (CC-BY 4.0) - VCTK dataset speakers", + "cml-tts/fr (CC-BY 4.0) - French voices", + "voice-donations (CC0) - public domain community voices", + "expresso (CC-BY-NC 4.0) - expressive (NON-COMMERCIAL ONLY)", + "ears (CC-BY-NC 4.0) - emotional (NON-COMMERCIAL ONLY)", + ], + } + return [types.TextContent(type="text", text=json.dumps(voice_info, indent=2))] + + +@mcp.tool(meta={ + "ui":{"resourceUri": WIDGET_URI}, + "ui/resourceUri": WIDGET_URI, # legacy support +}) +def say( + text: Annotated[str, Field(description="The English text to speak aloud")] = DEFAULT_TEXT, + voice: Annotated[str, Field( + description="Voice to use. 
Can be a predefined name (alba, marius, cosette, etc.), " + "a HuggingFace URL (hf://kyutai/tts-voices/...), or a local file path." + )] = DEFAULT_VOICE, + autoPlay: Annotated[bool, Field( + description="Whether to start playing automatically. Note: browsers may block autoplay until user interaction." + )] = True, +) -> list[types.TextContent]: + """Speak English text aloud using text-to-speech. + + Use when the user wants text read or spoken aloud: + - "say ...", "speak ...", "read ... out loud" + - "...; say it", "...; read it to me", "...; speak it" + - "narrate ...", "read this aloud" + + Audio streams in real-time as text is provided. + Use list_voices() for voice options. + + Note: English only. Non-English text may produce poor or garbled results. + """ + # Generate a unique ID for this widget instance (used for speak lock coordination) + widget_uuid = uuid.uuid4().hex[:12] + + # This is a no-op - the widget handles everything via ontoolinputpartial + # The tool exists to: + # 1. Trigger the widget to load + # 2. Provide the resourceUri metadata + # 3. Show the final text in the tool result + # 4. Provide widget UUID for multi-player coordination + return [types.TextContent( + type="text", + text=f"Displayed a TTS widget with voice '{voice}'. Click to play/pause, use toolbar to restart or fullscreen.", + _meta={"widgetUUID": widget_uuid}, + )] + + +# ------------------------------------------------------ +# Private Tools: TTS Queue Management +# ------------------------------------------------------ + +@mcp.tool(meta={"ui":{"visibility":["app"]}}) +async def create_tts_queue(voice: str = "cosette") -> list[types.TextContent]: + """Create a TTS generation queue. Returns queue_id and sample_rate. + + Args: + voice: Voice to use (cosette, alba, brenda, etc.) + """ + if tts_model is None: + return [types.TextContent(type="text", text='{"error": "TTS model not loaded"}')] + + queue_id = uuid.uuid4().hex[:12] + sample_rate = tts_model.config.mimi.sample_rate + + state = TTSQueueState( + id=queue_id, + voice=voice, + sample_rate=sample_rate, + ) + tts_queues[queue_id] = state + + # Start background TTS processing task + state.task = asyncio.create_task(_run_tts_queue(state)) + + logger.info(f"Created TTS queue {queue_id}") + + import json + return [types.TextContent( + type="text", + text=json.dumps({"queue_id": queue_id, "sample_rate": sample_rate}) + )] + + +@mcp.tool(meta={"ui":{"visibility":["app"]}}) +def add_tts_text(queue_id: str, text: str) -> list[types.TextContent]: + """Add text to a TTS queue. + + Args: + queue_id: The queue ID from create_tts_queue + text: Text to add (incremental, not cumulative) + """ + state = tts_queues.get(queue_id) + if not state: + return [types.TextContent(type="text", text='{"error": "Queue not found"}')] + if state.end_signaled: + return [types.TextContent(type="text", text='{"error": "Queue already ended"}')] + + # Queue the text (non-blocking) + try: + state.text_queue.put_nowait(text) + state.last_activity = time.time() # Update activity timestamp + except asyncio.QueueFull: + return [types.TextContent(type="text", text='{"error": "Queue full"}')] + + return [types.TextContent(type="text", text='{"queued": true}')] + + +@mcp.tool(meta={"ui":{"visibility":["app"]}}) +def end_tts_queue(queue_id: str) -> list[types.TextContent]: + """Signal that no more text will be sent to a queue. 
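+ + Once the end signal is received, the background task flushes any buffered text and marks the queue complete.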
+ + Args: + queue_id: The queue ID from create_tts_queue + """ + state = tts_queues.get(queue_id) + if not state: + logger.warning(f"end_tts_queue called for unknown queue: {queue_id}") + return [types.TextContent(type="text", text='{"error": "Queue not found"}')] + if state.end_signaled: + logger.info(f"end_tts_queue called for already-ended queue: {queue_id}") + return [types.TextContent(type="text", text='{"already_ended": true}')] + + state.end_signaled = True + state.last_activity = time.time() # Update activity timestamp + try: + state.text_queue.put_nowait(None) # EOF marker + except asyncio.QueueFull: + pass + + logger.info(f"end_tts_queue called for queue: {queue_id}") + return [types.TextContent(type="text", text='{"ended": true}')] + + +@mcp.tool(meta={"ui":{"visibility":["app"]}}) +def cancel_tts_queue(queue_id: str) -> list[types.TextContent]: + """Cancel and cleanup a TTS queue. Use before creating a new queue to avoid overlapping playback. + + Args: + queue_id: The queue ID from create_tts_queue + """ + state = tts_queues.pop(queue_id, None) + if not state: + return [types.TextContent(type="text", text='{"error": "Queue not found"}')] + + # Cancel the background task + if state.task and not state.task.done(): + state.task.cancel() + logger.info(f"Cancelled TTS queue {queue_id}") + + # Signal end to unblock any waiting consumers + state.end_signaled = True + try: + state.text_queue.put_nowait(None) + except asyncio.QueueFull: + pass + + state.status = "complete" + + return [types.TextContent(type="text", text='{"cancelled": true}')] + + +@mcp.tool(meta={"ui":{"visibility":["app"]}}) +def poll_tts_audio(queue_id: str) -> list[types.TextContent]: + """Poll for available audio chunks from a TTS queue. + + Returns base64-encoded audio chunks with timing metadata. + Call repeatedly until done=true. 
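+ Each call returns only the chunks produced since the previous poll, and polling refreshes the queue's activity timestamp so an actively polled queue is not timed out.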
+ + Args: + queue_id: The queue ID from create_tts_queue + """ + import json + import time + + state = tts_queues.get(queue_id) + if not state: + return [types.TextContent(type="text", text='{"error": "Queue not found"}')] + + # Update last activity to prevent timeout during active polling + state.last_activity = time.time() + + # Get new chunks (use sync approach since we can't await in tool) + # The lock is async, so we need to be careful here + # For simplicity, just grab what's available without locking + new_chunks = state.audio_chunks[state.chunks_delivered:] + state.chunks_delivered = len(state.audio_chunks) + + # Consider queues with errors as "done" so widget stops polling + done = (state.status == "complete" or state.status == "error") and state.chunks_delivered >= len(state.audio_chunks) + + response = { + "chunks": [ + { + "index": c.index, + "audio_base64": c.audio_base64, + "char_start": c.char_start, + "char_end": c.char_end, + "duration_ms": c.duration_ms, + } + for c in new_chunks + ], + "done": done, + "status": state.status, + } + + # Include error message if present + if state.error_message: + response["error"] = state.error_message + + # Clean up completed or errored queues + if done: + # Schedule cleanup after a delay + async def cleanup(): + await asyncio.sleep(60) + tts_queues.pop(queue_id, None) + try: + asyncio.get_event_loop().create_task(cleanup()) + except RuntimeError: + pass + + return [types.TextContent(type="text", text=json.dumps(response))] + + +# ------------------------------------------------------ +# Background TTS Processing +# ------------------------------------------------------ + + +class StreamingTextChunker: + """Buffers streaming text and emits chunks when ready for TTS processing. + + Chunks are emitted when: + - Token count reaches max_tokens threshold (at a sentence boundary if possible) + - flush() is called (end of stream) + + This matches the chunking behavior of split_into_best_sentences() but works + incrementally as text arrives. + """ + + def __init__(self, tokenizer, max_tokens: int = 50, min_tokens: int = 15): + """ + Args: + tokenizer: SentencePiece tokenizer from flow_lm.conditioner.tokenizer + max_tokens: Maximum tokens per chunk (default 50, matches existing) + min_tokens: Minimum tokens before considering emission + """ + self.tokenizer = tokenizer + self.max_tokens = max_tokens + self.min_tokens = min_tokens + self.buffer = "" + + # Cache end-of-sentence token IDs for boundary detection + _, *eos_tokens = tokenizer(".!...?").tokens[0].tolist() + self.eos_tokens = set(eos_tokens) + + def add_text(self, text: str) -> list[str]: + """Add text to buffer, return any complete chunks ready for processing. + + Args: + text: Incremental text to add (e.g., from LLM token) + + Returns: + List of text chunks ready for TTS (may be empty if still buffering) + """ + self.buffer += text + return self._extract_ready_chunks() + + def flush(self) -> list[str]: + """Flush remaining buffer as final chunk(s). + + Call this when the text stream ends to process any remaining text. 
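+ Any buffered text below the min_tokens threshold is emitted anyway rather than discarded.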
+ + Returns: + List of final text chunks (may be empty if buffer was empty) + """ + if not self.buffer.strip(): + return [] + + # Force emit whatever remains + chunks = self._extract_ready_chunks(force_emit=True) + if self.buffer.strip(): + chunks.append(self.buffer.strip()) + self.buffer = "" + return chunks + + def _extract_ready_chunks(self, force_emit: bool = False) -> list[str]: + """Extract chunks that are ready for processing.""" + chunks = [] + + while True: + chunk = self._try_extract_chunk(force_emit and not chunks) + if chunk is None: + break + chunks.append(chunk) + + return chunks + + def _try_extract_chunk(self, force_emit: bool = False) -> str | None: + """Try to extract one chunk from buffer.""" + text = self.buffer.strip() + if not text: + return None + + tokens = self.tokenizer(text).tokens[0].tolist() + num_tokens = len(tokens) + + # Not enough tokens yet + if num_tokens < self.min_tokens and not force_emit: + return None + + # Under max and not forcing - check for complete sentence worth emitting + if num_tokens < self.max_tokens and not force_emit: + # Only emit early if we have a complete sentence at a good length + if num_tokens >= self.min_tokens and self._ends_with_sentence_boundary(tokens): + # Found a complete sentence - emit it + chunk = text + self.buffer = "" + return chunk + return None + + # Over max_tokens or force_emit - find best split point + split_idx = self._find_best_split(tokens, force_emit) + + if split_idx == 0: + if force_emit: + chunk = text + self.buffer = "" + return chunk + return None + + # Decode tokens up to split point + chunk_text = self.tokenizer.sp.decode(tokens[:split_idx]) + remaining_text = self.tokenizer.sp.decode(tokens[split_idx:]) + + self.buffer = remaining_text + return chunk_text.strip() + + def _find_best_split(self, tokens: list[int], force_emit: bool = False) -> int: + """Find the best token index to split at (sentence boundary near max_tokens).""" + # Find all sentence boundaries (position AFTER the punctuation) + boundaries = [] + prev_was_eos = False + + for i, token in enumerate(tokens): + if token in self.eos_tokens: + prev_was_eos = True + elif prev_was_eos: + boundaries.append(i) + prev_was_eos = False + + # Also consider end of tokens if it ends with punctuation + if tokens and tokens[-1] in self.eos_tokens: + boundaries.append(len(tokens)) + + if not boundaries: + # No sentence boundaries - split at max_tokens if we're over + if len(tokens) >= self.max_tokens: + return self.max_tokens + return len(tokens) if force_emit else 0 + + # Find boundary closest to max_tokens without going too far over + best_boundary = 0 + for boundary in boundaries: + if boundary <= self.max_tokens: + best_boundary = boundary + elif best_boundary == 0: + # First boundary is past max - use it anyway + best_boundary = boundary + break + else: + # We have a good boundary before max, stop + break + + return best_boundary + + def _ends_with_sentence_boundary(self, tokens: list[int]) -> bool: + """Check if token sequence ends with sentence-ending punctuation.""" + if not tokens: + return False + return tokens[-1] in self.eos_tokens + + @property + def buffered_text(self) -> str: + """Current buffered text (for debugging/monitoring).""" + return self.buffer + + @property + def buffered_token_count(self) -> int: + """Approximate token count in buffer.""" + if not self.buffer.strip(): + return 0 + return len(self.tokenizer(self.buffer).tokens[0].tolist()) + + +async def _run_tts_queue(state: TTSQueueState): + """Background task: consume text 
queue, produce audio chunks.""" + if tts_model is None: + state.status = "error" + state.error_message = "TTS model not loaded" + return + + model_state = tts_model._cached_get_state_for_audio_prompt(state.voice, truncate=True) + chunker = StreamingTextChunker(tts_model.flow_lm.conditioner.tokenizer) + chunk_index = 0 + char_offset = 0 + + try: + while True: + # Wait for text with timeout to detect stale queues + try: + text_item = await asyncio.wait_for( + state.text_queue.get(), + timeout=5.0 # Check every 5 seconds + ) + except asyncio.TimeoutError: + # Check if queue is stale (no activity for too long) + if time.time() - state.last_activity > QUEUE_TIMEOUT_SECONDS: + logger.warning(f"TTS queue {state.id} timeout after {QUEUE_TIMEOUT_SECONDS}s of inactivity") + state.status = "error" + state.error_message = f"Queue timeout: no activity for {QUEUE_TIMEOUT_SECONDS}s" + break + # Continue waiting - queue might still be active + continue + + if text_item is None: + # EOF - flush remaining text + remaining = chunker.flush() + for chunk_text in remaining: + await _process_tts_chunk(state, chunk_text, chunk_index, char_offset, model_state) + char_offset += len(chunk_text) + chunk_index += 1 + + state.status = "complete" + logger.info(f"TTS queue {state.id} complete: {chunk_index} chunks") + break + + # Feed text to chunker + ready_chunks = chunker.add_text(text_item) + + for chunk_text in ready_chunks: + await _process_tts_chunk(state, chunk_text, chunk_index, char_offset, model_state) + char_offset += len(chunk_text) + chunk_index += 1 + + except Exception as e: + logger.error(f"TTS queue {state.id} error: {e}") + state.status = "error" + state.error_message = str(e) + + +async def _process_tts_chunk( + state: TTSQueueState, + text: str, + chunk_index: int, + char_offset: int, + model_state: dict, +): + """Process a text chunk and add audio to state.""" + if tts_model is None: + return + + loop = asyncio.get_event_loop() + audio_bytes_list: list[bytes] = [] + total_samples = 0 + + def generate_sync(): + nonlocal total_samples + _, frames_after_eos = prepare_text_prompt(text) + frames_after_eos += 2 + + for audio_chunk in tts_model._generate_audio_stream_short_text( + model_state=model_state, + text_to_generate=text, + frames_after_eos=frames_after_eos, + copy_state=True, + ): + audio_int16 = (audio_chunk * 32767).to(torch.int16) + audio_bytes_list.append(audio_int16.cpu().numpy().tobytes()) + total_samples += len(audio_chunk) + + await loop.run_in_executor(None, generate_sync) + + combined_audio = b"".join(audio_bytes_list) + duration_ms = (total_samples / state.sample_rate) * 1000 + + chunk_data = AudioChunkData( + index=chunk_index, + audio_base64=base64.b64encode(combined_audio).decode(), + char_start=char_offset, + char_end=char_offset + len(text), + duration_ms=duration_ms, + ) + + async with state.lock: + state.audio_chunks.append(chunk_data) + + logger.debug(f"TTS queue {state.id}: chunk {chunk_index} ready ({duration_ms:.0f}ms)") + + +# ------------------------------------------------------ +# Widget Resource +# ------------------------------------------------------ + +# Embedded widget HTML for standalone execution via `uv run ` +# Uses Babel standalone for in-browser JSX transpilation +# This is a copy of widget.html - keep them in sync! +EMBEDDED_WIDGET_HTML = """ + + + + + Say Widget + + + + + +
+ + +""" + + +def get_widget_html() -> str: + """Get the widget HTML, preferring built version from dist/.""" + # Prefer built version from dist/ (local development with npm run build) + dist_path = Path(__file__).parent / "dist" / "mcp-app.html" + if dist_path.exists(): + return dist_path.read_text() + # Fallback to embedded widget (for `uv run ` or unbundled usage) + return EMBEDDED_WIDGET_HTML + + +# IMPORTANT: all the external domains used by app must be listed +# in the meta.ui.csp.resourceDomains - otherwise they will be blocked by CSP policy +@mcp.resource( + WIDGET_URI, + mime_type="text/html;profile=mcp-app", + meta={"ui": {"csp": {"resourceDomains": ["https://esm.sh", "https://unpkg.com"]}}}, +) +def widget() -> str: + """Widget HTML resource with CSP metadata for external dependencies.""" + return get_widget_html() + + +# ------------------------------------------------------ +# Startup +# ------------------------------------------------------ + +def load_tts_model(): + """Load the TTS model on startup.""" + global tts_model + logger.info("Loading TTS model...") + tts_model = TTSModel.load_model() + logger.info("TTS model loaded") + + +def create_app(): + """Create the ASGI app (for uvicorn reload mode).""" + load_tts_model() + app = mcp.streamable_http_app(stateless_http=True) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], + ) + return app + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + if "--stdio" in sys.argv: + # Claude Desktop mode + load_tts_model() + mcp.run(transport="stdio") + elif "--reload" in sys.argv: + # Reload mode - pass app as string so uvicorn can reimport + print(f"Say Server listening on http://{HOST}:{PORT}/mcp (reload mode)") + uvicorn.run("server:create_app", host=HOST, port=PORT, reload=True, factory=True) + else: + # HTTP mode + app = create_app() + print(f"Say Server listening on http://{HOST}:{PORT}/mcp") + uvicorn.run(app, host=HOST, port=PORT) diff --git a/package-lock.json b/package-lock.json index 405f3148d..b82b93a41 100644 --- a/package-lock.json +++ b/package-lock.json @@ -617,6 +617,44 @@ "name": "@modelcontextprotocol/server-qr", "version": "1.0.0" }, + "examples/say-server": { + "name": "@modelcontextprotocol/server-say", + "version": "0.4.1", + "license": "MIT", + "dependencies": { + "@modelcontextprotocol/ext-apps": "^0.4.1", + "react": "^19.2.0", + "react-dom": "^19.2.0" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "@types/react": "^19.2.2", + "@types/react-dom": "^19.2.2", + "@vitejs/plugin-react": "^4.3.4", + "concurrently": "^9.2.1", + "cross-env": "^10.1.0", + "typescript": "^5.9.3", + "vite": "^6.0.0", + "vite-plugin-singlefile": "^2.3.0" + } + }, + "examples/say-server/node_modules/@types/node": { + "version": "22.19.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.7.tgz", + "integrity": "sha512-MciR4AKGHWl7xwxkBa6xUGxQJ4VBOmPTF7sL+iGzuahOFaO0jHCsuEfS80pan1ef4gWId1oWOweIhrDEYLuaOw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "examples/say-server/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, "examples/scenario-modeler-server": { "name": "@modelcontextprotocol/server-scenario-modeler", "version": "0.4.1", @@ -2480,6 
+2518,10 @@ "resolved": "examples/qr-server", "link": true }, + "node_modules/@modelcontextprotocol/server-say": { + "resolved": "examples/say-server", + "link": true + }, "node_modules/@modelcontextprotocol/server-scenario-modeler": { "resolved": "examples/scenario-modeler-server", "link": true diff --git a/tests/e2e/generate-grid-screenshots.spec.ts b/tests/e2e/generate-grid-screenshots.spec.ts index 80e3c0a15..1bc639c8e 100644 --- a/tests/e2e/generate-grid-screenshots.spec.ts +++ b/tests/e2e/generate-grid-screenshots.spec.ts @@ -58,6 +58,7 @@ const ALL_SERVERS = [ { key: "map-server", name: "Map Server", dir: "map-server" }, { key: "pdf-server", name: "PDF Server", dir: "pdf-server" }, { key: "qr-server", name: "QR Code Server", dir: "qr-server" }, + { key: "say-server", name: "Say Demo", dir: "say-server" }, { key: "scenario-modeler", name: "SaaS Scenario Modeler", diff --git a/tests/e2e/servers.spec.ts b/tests/e2e/servers.spec.ts index a3df89fb7..d3928931a 100644 --- a/tests/e2e/servers.spec.ts +++ b/tests/e2e/servers.spec.ts @@ -16,6 +16,7 @@ const DYNAMIC_MASKS: Record = { "basic-vue": ["#server-time"], // Server time display "cohort-heatmap": ['[class*="heatmapWrapper"]'], // Heatmap grid (random data) "customer-segmentation": [".chart-container"], // Scatter plot (random data) + "say-server": [".playBtn", ".playOverlayBtn"], // Play buttons may have different states shadertoy: ["#canvas"], // WebGL shader canvas (animated) "system-monitor": [ ".chart-container", // CPU chart (highly dynamic) diff --git a/tests/e2e/servers.spec.ts-snapshots/say-server.png b/tests/e2e/servers.spec.ts-snapshots/say-server.png new file mode 100644 index 000000000..6bd22151e Binary files /dev/null and b/tests/e2e/servers.spec.ts-snapshots/say-server.png differ