diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index c34818f2..cb90e3a0 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -63,3 +63,16 @@ jobs: - uses: astral-sh/setup-uv@v4 - run: uv sync --extra dev - run: uv run pytest -m integration tests/test_github.py -v + + integration-tests-buildkite: + runs-on: ubuntu-latest + timeout-minutes: 30 + # Skip for Dependabot PRs as they don't have access to secrets + if: github.actor != 'dependabot[bot]' + env: + BUILDKITE_API_TOKEN: ${{ secrets.BUILDKITE_API_TOKEN }} + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v4 + - run: uv sync --extra dev + - run: uv run pytest -m integration tests/test_buildkite.py -v diff --git a/SKILLS/buildkite.md b/SKILLS/buildkite.md new file mode 100644 index 00000000..b3222cb0 --- /dev/null +++ b/SKILLS/buildkite.md @@ -0,0 +1,870 @@ +# Buildkite GPU Infrastructure Guide + +This document describes how to set up and use the Buildkite infrastructure for GPU job isolation. + +## Overview + +Buildkite provides a parallel infrastructure for onboarding arbitrary GPU vendors with proper isolation. It runs alongside the existing GitHub Actions system, providing: + +- Per-GPU job isolation via `NVIDIA_VISIBLE_DEVICES` +- Resource constraints (CPU, RAM, disk) via Docker cgroups +- Clear, reproducible Docker environment +- Automatic queue management + +## Quick Start + +1. Create queue in Buildkite UI: Agents → Default cluster → Queues → New Queue (select "Self hosted") +2. Run setup script on your GPU node: + ```bash + sudo BUILDKITE_AGENT_TOKEN= GPU_TYPE= ./deployment/buildkite/setup-node-simple.sh + ``` +3. Test with `pipeline-test-docker.yml` + +## Current Status + +**Working**: Full GPU isolation with auto-resource detection. Tested on 2x NVIDIA L40S node with: +- Each agent gets 1 GPU, 8 CPUs, 144GB RAM (auto-calculated from 16 CPUs / 2 GPUs, 289GB / 2 GPUs) + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ VENDOR 8-GPU NODE │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ Agent GPU-0 │ │ Agent GPU-1 │ ... │ Agent GPU-7 │ │ +│ │ NVIDIA_VIS │ │ NVIDIA_VIS │ │ NVIDIA_VIS │ │ +│ │ IBLE_DEV=0 │ │ IBLE_DEV=1 │ │ IBLE_DEV=7 │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ +│ └───────────────┴───────────────────┘ │ +│ │ │ +│ ┌────────────▼────────────┐ │ +│ │ queue = "nvidia-b200" │ ← All agents same queue│ +│ └─────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ + ┌───────────────────────┐ + │ BUILDKITE CLOUD │ + │ Routes to idle agent │ + └───────────────────────┘ +``` + +## Prerequisites + +### Buildkite Account Setup + +1. Create/access Buildkite organization at https://buildkite.com +2. Create a pipeline named `kernelbot` +3. Generate two tokens: + - **Agent Token**: For nodes to connect (Agents → Agent Tokens) + - **API Token**: For submitting jobs (Personal Settings → API Access Tokens) + +### API Token Permissions + +The API token needs these scopes: +- `read_builds` - Poll build status +- `write_builds` - Create/trigger builds +- `read_artifacts` - Download result.json artifact +- `read_agents` (optional) - Check queue status + +## Vendor Node Setup + +### Prerequisites (Do This First in Buildkite UI) + +Before running the setup script on your node: + +1. **Create Buildkite account** at https://buildkite.com +2. **Create pipeline** named `kernelbot` +3. 
**Generate Agent Token**: Go to Agents → Agent Tokens → New Token +4. **Create Queue**: Go to Agents → Default cluster → Queues → New Queue + - Enter your GPU type as the key (e.g., `test`, `b200`, `h100`) + - Select **Self hosted** + - Click Create Queue + +### Run Setup Script + +On your GPU node: + +```bash +git clone https://github.com/gpu-mode/kernelbot.git +cd kernelbot + +sudo BUILDKITE_AGENT_TOKEN= GPU_TYPE= ./deployment/buildkite/setup-node-simple.sh +``` + +The script will: +- Install Buildkite agent (if not present) +- Create one agent per GPU with proper isolation +- Configure git to use HTTPS (avoids SSH key issues) +- Create environment hook that sets `NVIDIA_VISIBLE_DEVICES` per job +- Start all agents as systemd services + +### Verify Setup + +1. Check agents appear in Buildkite: https://buildkite.com/organizations/YOUR_ORG/agents +2. Run a test build with this pipeline: + +```yaml +steps: + - label: "GPU Test" + command: "echo NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES && nvidia-smi -L" + agents: + queue: "your-queue-name" +``` + +### Environment Variables + +The setup script sets these automatically: +- `GPU_TYPE` (required): Queue name matching what you created in Buildkite +- `BUILDKITE_AGENT_TOKEN` (required): Agent token from Buildkite +- `NODE_NAME` (optional): Defaults to hostname +- `GPU_COUNT` (optional): Auto-detected from nvidia-smi + +## Pipeline Configuration + +### Create Pipeline in Buildkite + +1. Go to Pipelines → New Pipeline +2. Name: `kernelbot` +3. Repository: `https://github.com/gpu-mode/kernelbot` +4. Steps: Either upload from repo or paste directly + +### Pipeline YAML + +The pipeline is at `deployment/buildkite/pipeline.yml`: + +```yaml +steps: + - label: ":rocket: Kernel Evaluation" + command: "python /app/src/runners/buildkite-runner.py" + agents: + queue: "${KERNELBOT_QUEUE}" + plugins: + - docker#v5.11.0: + image: "${KERNELBOT_IMAGE:-ghcr.io/gpu-mode/kernelbot:latest}" + runtime: nvidia + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + timeout_in_minutes: 15 +``` + +## Testing + +### Working Docker Pipeline + +Use this tested pipeline configuration for GPU jobs: + +```yaml +steps: + - label: ":whale: Docker GPU Test" + agents: + queue: "test" # Must match your queue name + plugins: + - docker#v5.11.0: + image: "nvidia/cuda:12.4.0-runtime-ubuntu22.04" + always-pull: true + gpus: "all" # Use gpus instead of runtime: nvidia + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + command: | + echo "=== Resource Isolation ===" + echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" + nvidia-smi + nproc + free -h + timeout_in_minutes: 5 +``` + +**Key points**: +- Use `gpus: "all"` instead of `runtime: nvidia` (more reliable) +- Use `$$NVIDIA_VISIBLE_DEVICES` (double dollar) in YAML to prevent variable stripping +- The environment hook auto-sets KERNELBOT_CPUS, KERNELBOT_MEMORY based on machine resources + +### End-to-End Test + +Run from your local machine: + +```bash +cd kernelbot +BUILDKITE_API_TOKEN= uv run python tests/e2e_buildkite_test.py --queue test +``` + +Options: +- `--queue `: Target queue (default: test) +- `--org `: Buildkite org (default: mark-saroufim) +- `--pipeline `: Pipeline name (default: kernelbot) +- `--dry-run`: Print config without submitting + +### Check Queue Status + +```bash +BUILDKITE_API_TOKEN= uv run python -c " 
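+# Queue status needs an API token with the read_agents scope; org and pipeline fall
+# back to the documented defaults (mark-saroufim / kernelbot) unless set on BuildkiteConfig.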
+import asyncio +from libkernelbot.launchers.buildkite import BuildkiteLauncher, BuildkiteConfig + +async def main(): + launcher = BuildkiteLauncher(BuildkiteConfig(api_token='')) + status = await launcher.get_queue_status('test') + print(f'Queue: {status[\"queue\"]}') + print(f'Total agents: {status[\"total\"]}') + print(f'Idle agents: {status[\"idle\"]}') + for agent in status['agents']: + print(f' - {agent[\"name\"]}: busy={agent[\"busy\"]}') + +asyncio.run(main()) +" +``` + +## GPU Types + +Buildkite-managed GPUs are registered with `_BK` suffix: + +| GPU Type | Queue | SM Arch | +|----------|-------|---------| +| `B200_BK` | `b200` | 100 | +| `H100_BK` | `h100` | 90a | +| `MI300_BK` | `mi300` | (AMD) | +| `L40S_BK` | `test` | 89 (Ada Lovelace) | + +## Environment Variables + +### On Heroku/Backend (where the app runs) + +These are set in Heroku config vars or your `.env` file: + +| Variable | Required | Description | +|----------|----------|-------------| +| `BUILDKITE_API_TOKEN` | Yes | API token for submitting jobs and downloading artifacts. Get from Buildkite → Personal Settings → API Access Tokens | +| `BUILDKITE_ORG` | No | Organization slug (default: `mark-saroufim`) | +| `BUILDKITE_PIPELINE` | No | Pipeline slug (default: `kernelbot`) | + +**API Token Permissions Required:** +- `read_builds` - Poll build status +- `write_builds` - Create/trigger builds +- `read_artifacts` - Download result.json artifact +- `read_agents` (optional) - Check queue status + +### On GPU Runner Nodes + +These are set during node setup: + +| Variable | Set By | Description | +|----------|--------|-------------| +| `BUILDKITE_AGENT_TOKEN` | Admin (setup script) | Agent token for connecting to Buildkite | +| `NVIDIA_VISIBLE_DEVICES` | Environment hook | GPU index for isolation (auto-set per job) | +| `CUDA_VISIBLE_DEVICES` | Environment hook | Same as above | +| `KERNELBOT_GPU_INDEX` | Environment hook | GPU index (0, 1, 2, ...) | +| `KERNELBOT_CPUSET` | Environment hook | CPU cores for this agent | +| `KERNELBOT_MEMORY` | Environment hook | Memory limit for Docker | + +### Passed to Jobs (via Buildkite API) + +These are set automatically by the launcher: + +| Variable | Description | +|----------|-------------| +| `KERNELBOT_RUN_ID` | Unique run identifier | +| `KERNELBOT_PAYLOAD` | Base64+zlib compressed job config | +| `KERNELBOT_QUEUE` | Target queue name | +| `KERNELBOT_IMAGE` | Docker image to use | + +## Troubleshooting + +### Agent not appearing in dashboard + +1. Check agent is running: `sudo systemctl status buildkite-agent` +2. Check logs: `sudo journalctl -u buildkite-agent -f` +3. Verify token is correct in `/etc/buildkite-agent/buildkite-agent.cfg` + +### Job stuck in queue + +1. Check agents are idle: Buildkite dashboard → Agents +2. Verify queue name matches agent tags +3. Check agent logs for errors + +### Docker permission denied + +```bash +sudo usermod -aG docker buildkite-agent +sudo systemctl restart buildkite-agent +``` + +### GPU not visible in container + +1. Verify nvidia-container-toolkit: `nvidia-ctk --version` +2. Configure docker runtime: `sudo nvidia-ctk runtime configure --runtime=docker` +3. Restart docker: `sudo systemctl restart docker` +4. 
Test: `docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi` + +### Package dependency conflicts (nvidia-container-toolkit) + +If you see version conflicts: +```bash +sudo apt-get install -y nvidia-container-toolkit=1.18.1-1 nvidia-container-toolkit-base=1.18.1-1 +``` + +### Agent fails with "Missing build-path" + +The config file needs `build-path` set: + +```bash +sudo nano /etc/buildkite-agent/buildkite-agent.cfg +``` + +Add this line: +``` +build-path="/var/lib/buildkite-agent/builds" +``` + +Then: +```bash +sudo mkdir -p /var/lib/buildkite-agent/builds +sudo chown buildkite-agent:buildkite-agent /var/lib/buildkite-agent/builds +sudo systemctl restart buildkite-agent +``` + +### Agent not appearing - "Could not find queue" + +You must create the queue in Buildkite web UI: +1. Go to **Agents** tab → **Default cluster** → **Queues** +2. Click **New Queue** +3. Enter queue name (e.g., `test`) +4. Select **Self hosted** +5. Click **Create Queue** + +### Jobs run on hosted agents instead of self-hosted + +Make sure your pipeline steps include the queue: + +```yaml +steps: + - label: ":rocket: Test Job" + command: "nvidia-smi" + agents: + queue: "test" # This is required! +``` + +Without `agents: queue:`, Buildkite uses hosted runners by default. + +### Docker runtime crashes / "nvidia-container-runtime: no such file" + +Use `gpus: "all"` in the Docker plugin instead of `runtime: nvidia`: + +```yaml +plugins: + - docker#v5.11.0: + gpus: "all" # ✓ Use this + # runtime: nvidia # ✗ Avoid - can cause crashes +``` + +If issues persist, reinstall nvidia-container-toolkit: +```bash +sudo apt-get remove --purge nvidia-container-toolkit nvidia-container-toolkit-base +sudo apt-get install nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +### Environment hook not running + +Make sure the hook has a shebang line: +```bash +#!/bin/bash +# Rest of hook script... +``` + +### Git clone fails with "Permission denied (publickey)" + +The buildkite-agent user doesn't have SSH keys for GitHub. 
Fix by using HTTPS: + +```bash +cd /tmp && sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" +``` + +## Resource Isolation + +| Resource | Method | Enforcement | +|----------|--------|-------------| +| GPU | `NVIDIA_VISIBLE_DEVICES` | Per-agent env var | +| CPU | `--cpuset-cpus` | Docker cgroups | +| Memory | `--memory` | Docker cgroups | +| Disk | Separate build paths | Filesystem | +| Network | Docker bridge | Container isolation | + +## Files Reference + +| File | Purpose | +|------|---------| +| `deployment/buildkite/setup-node-simple.sh` | Vendor node setup script (recommended) | +| `deployment/buildkite/pipeline.yml` | Buildkite pipeline config | +| `deployment/buildkite/pipeline-test-docker.yml` | Docker test pipeline | +| `deployment/buildkite/Dockerfile` | Docker image for jobs | +| `src/libkernelbot/launchers/buildkite.py` | BuildkiteLauncher class | +| `src/runners/buildkite-runner.py` | Job execution script | +| `tests/e2e_buildkite_test.py` | E2E test script | + +## Auto-Resource Detection + +The environment hook automatically detects and divides machine resources: + +``` +Machine: 16 CPUs, 289GB RAM, 2 GPUs + ↓ +Per-GPU allocation: + - GPU 0: CPUs 0-7, 144GB RAM + - GPU 1: CPUs 8-15, 144GB RAM +``` + +This is calculated in the environment hook as: +- `CPUS_PER_GPU = TOTAL_CPUS / GPU_COUNT` +- `RAM_PER_GPU = TOTAL_RAM_GB / GPU_COUNT` +- `KERNELBOT_CPUSET = (GPU_INDEX * CPUS_PER_GPU) to ((GPU_INDEX + 1) * CPUS_PER_GPU - 1)` + +## Summary of Key Decisions + +1. **Use `gpus: "all"` not `runtime: nvidia`** - More reliable with nvidia-container-toolkit +2. **Environment hook for isolation** - Sets `NVIDIA_VISIBLE_DEVICES`, `KERNELBOT_*` vars before each job +3. **Auto-detect resources** - No hardcoded CPU/RAM values; divides machine resources by GPU count +4. **One agent per GPU** - Each agent has its own build path and GPU assignment +5. **HTTPS for git** - Avoids SSH key issues on buildkite-agent user +6. **Queue must exist first** - Create queue in Buildkite UI before agents can connect +7. **Follow S3 redirects for artifacts** - Buildkite returns 302 to S3; must fetch without auth header + +## E2E Workflow (Verified Working) + +The complete end-to-end flow for submitting jobs and retrieving results: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Your Backend │────▶│ Buildkite │────▶│ GPU Runner │ +│ │ │ Cloud │ │ (Self-hosted) │ +│ BuildkiteLauncher │ │ │ │ +│ ._launch() │ │ Routes to │ │ Runs Docker │ +│ │ │ idle agent │ │ container │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ │ │ + │ 1. POST /builds │ │ + │ (payload encoded) │ │ + │──────────────────────▶│ │ + │ │ 2. Dispatch job │ + │ │──────────────────────▶│ + │ │ │ + │ │ │ 3. Run evaluation + │ │ │ Write result.json + │ │ │ + │ │ 4. Upload artifact │ + │ │◀──────────────────────│ + │ │ │ + │ 5. Poll status │ │ + │◀─────────────────────▶│ │ + │ │ │ + │ 6. Download artifact │ │ + │ (via S3 redirect) │ │ + │◀──────────────────────│ │ + │ │ │ + ▼ │ │ + Return result │ │ +``` + +### Verified Test Output + +``` +=== Buildkite E2E Test === +Organization: mark-saroufim +Pipeline: kernelbot +Queue: test +Mode: artifact + +Submitting test job... 
+[UPDATE] Build created: [28] +[UPDATE] Build completed: [28] + +=== Result === +Success: True +Build URL: https://buildkite.com/mark-saroufim/kernelbot/builds/28 +Downloaded artifact: +{ + "success": true, + "error": "", + "runs": {}, + "system": { + "gpu_name": "test", + "cuda_version": "12.4", + "python_version": "N/A" + } +} + +=== Queue Status === +Queue: test +Total agents: 0 +Idle agents: 0 +``` + +### How It Works + +1. **BuildkiteLauncher._launch()** encodes config as base64+zlib compressed payload +2. **POST to Buildkite API** creates a build with env vars (KERNELBOT_PAYLOAD, KERNELBOT_RUN_ID) +3. **Buildkite routes** the job to an idle agent in the specified queue +4. **Agent runs Docker container** with GPU isolation (NVIDIA_VISIBLE_DEVICES set by environment hook) +5. **Container writes result.json** to working directory +6. **Buildkite uploads artifact** to S3 +7. **BuildkiteLauncher polls** until build completes +8. **Downloads result.json** by following S3 redirect (without auth header) +9. **Returns parsed result** to caller + +### Running the E2E Test + +```bash +BUILDKITE_API_TOKEN= uv run python tests/e2e_buildkite_test.py \ + --org \ + --queue test +``` + +## Real Evaluation Jobs + +### Submit a Real Kernel Evaluation + +```bash +BUILDKITE_API_TOKEN= uv run python scripts/submit_buildkite_job.py --eval vectoradd_py +``` + +This runs the full evaluation pipeline on actual GPU hardware and returns real benchmark results: + +``` +=== Result === +Success: True +System: SystemInfo(gpu='NVIDIA L40S', device_count=1, cpu='AMD EPYC 9254 24-Core Processor', runtime='CUDA', platform='Linux-5.15.0-164-generic-x86_64-with-glibc2.35', torch='2.6.0+cu124', hostname='...') + +test: + Passed: True + Duration: 3.18s + Result: {'test-count': '5', 'test.0.status': 'pass', 'test.1.status': 'pass', ...} +``` + +### Integration Tests + +Run the full integration test suite: + +```bash +BUILDKITE_API_TOKEN= uv run pytest tests/test_buildkite.py -v -m integration +``` + +Tests include: +- `test_buildkite_launcher_python_script` - Real evaluation with vectoradd_py +- `test_buildkite_launcher_failing_script` - Verifies cheating scripts correctly fail +- `test_buildkite_queue_status` - Tests agent queue API + +### Available Examples + +Any example in the `examples/` directory works: + +```bash +# List available examples +ls examples/ + +# Run a specific example +BUILDKITE_API_TOKEN=xxx uv run python scripts/submit_buildkite_job.py --eval identity_py +``` + +## Operational Model + +### Option 1: No Pre-Built Image (Current Default) + +The pipeline installs dependencies at runtime. Each job: + +1. Uses base `nvidia/cuda:12.4.0-devel-ubuntu22.04` image +2. Installs dependencies at runtime (~30-40 seconds): + - `uv` for Python package management + - Clones kernelbot repo + - Runs `uv sync` and `uv pip install torch triton numpy` +3. 
Runs the evaluation + +**Advantages:** +- No Dockerfile to maintain or rebuild +- No image registry to manage +- Always uses latest code from repo +- **No admin action needed** after code updates + +**Trade-off:** +- Slower cold starts (~40 seconds) + +### Option 2: Pre-Built Image (Fast Cold Starts) + +For faster cold starts (~5 seconds), build the Docker image on each node: + +```bash +# During initial setup: +sudo BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=test BUILD_IMAGE=true ./deployment/buildkite/setup-node-simple.sh + +# Or build separately: +./deployment/buildkite/build-image.sh +``` + +Then update the Buildkite pipeline config to use the local image: +```yaml +image: "kernelbot:latest" +``` + +**When to rebuild the image:** +- When dependencies change (new PyTorch version, new packages) +- When you want the latest kernelbot code baked in +- NOT needed for problem/task changes (those come via config) + +**Rebuild command:** +```bash +./deployment/buildkite/build-image.sh +``` + +### When Admin Action Is Needed + +| Scenario | Action Required | +|----------|-----------------| +| Code changes (no deps) | None - pipeline clones fresh code | +| Dependency changes | Rebuild image: `./build-image.sh` | +| Initial node setup | Run `setup-node-simple.sh` once | +| NVIDIA driver updates | May need to rebuild image | +| Buildkite agent updates | Rare - Buildkite handles this | + +### Shared Evaluation Logic + +All runners (GitHub, Modal, Buildkite) use the exact same evaluation engine: + +```python +# src/runners/buildkite-runner.py:49 +from libkernelbot.run_eval import run_config +result = run_config(config) +``` + +This means: +- Any problem that works on GitHub/Modal works on Buildkite +- Same result format (`FullResult`) +- Same test/benchmark logic +- Same correctness checking + +## Current Branch + +The Buildkite infrastructure is on the `buildkite-infrastructure` branch. The pipeline clones from this branch: + +```yaml +git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mode/kernelbot.git +``` + +Once merged to `main`, update the pipeline config to use `main` branch. + +## E2E Testing with Database + +A comprehensive end-to-end test script is available that: +1. Creates a test leaderboard in the database +2. Submits a real kernel evaluation to Buildkite +3. Stores results in PostgreSQL +4. Verifies data integrity + +### Running E2E Tests + +```bash +# Test mode (correctness only) +BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py \ + --org mark-saroufim --queue test + +# Leaderboard mode (with benchmarks and scoring) +BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py \ + --org mark-saroufim --queue test --mode leaderboard + +# With cleanup (delete test leaderboard after) +BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py \ + --org mark-saroufim --queue test --mode leaderboard --cleanup +``` + +### Verified Working (2026-02-04) + +| Mode | Status | Details | +|------|--------|---------| +| Test | ✅ | 5 tests passed, ~3.4s duration | +| Benchmark | ✅ | 30 runs, 4.07ms mean | +| Leaderboard | ✅ | Score computed and stored | +| Database | ✅ | All runs stored with system info | + +--- + +## Known Limitations & Review Notes + +This section documents known limitations and tradeoffs for code reviewers. + +### 1. 
Cold Start Overhead + +**Problem**: Each job incurs significant startup overhead: + +| Phase | Time | Notes | +|-------|------|-------| +| Docker pull | 10-30s | First run only if image not cached | +| Container start | 2-5s | Includes cgroup setup | +| Python imports | 5-10s | PyTorch, Triton, etc. | +| Code clone | 3-5s | If using runtime install | +| **Total cold start** | **20-50s** | Varies by image caching | + +**Current Approach**: We use a pre-built Docker image (`ghcr.io/gpu-mode/kernelbot:latest`) with dependencies baked in. This reduces cold start to ~10-15s after first pull. + +### 2. Dependency Installation Tradeoffs + +There are two operational models with different tradeoffs: + +#### Option A: Pre-Built Image (Current Default) +```yaml +image: "ghcr.io/gpu-mode/kernelbot:latest" +``` +- **Pros**: Fast cold starts (~5-10s), consistent environment +- **Cons**: Must rebuild image for dependency changes, requires image registry +- **When to rebuild**: PyTorch version change, new packages, security updates + +#### Option B: Runtime Installation +```yaml +image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" +command: | + pip install torch triton numpy + python eval.py +``` +- **Pros**: Always latest dependencies, no image maintenance +- **Cons**: Slow cold starts (~40-60s), network dependency, version drift +- **Use when**: Testing new dependencies, development + +#### Option C: Cached Dependencies on Host +```yaml +volumes: + - "/var/lib/buildkite-agent/cache/pip:/root/.cache/pip:rw" +``` +- **Pros**: Fast after first run, no image rebuild needed +- **Cons**: Cache invalidation complexity, disk usage, per-node setup +- **Use when**: Frequent dependency changes, limited registry access + +**Recommendation**: Use Option A (pre-built image) for production. Use Option B for development/testing new dependencies. + +### 3. GPU Isolation Limitations + +**Current Isolation Model**: +- GPU isolation via `NVIDIA_VISIBLE_DEVICES` environment variable +- CPU isolation via Docker `--cpuset-cpus` +- Memory isolation via Docker `--memory` + +**Known Gaps**: + +| Resource | Isolation Level | Notes | +|----------|-----------------|-------| +| GPU Compute | ✅ Strong | Only assigned GPU visible | +| GPU Memory | ⚠️ Partial | Other processes could exhaust VRAM if running | +| PCIe Bandwidth | ❌ None | Shared across all GPUs | +| NVLink | ❌ None | If present, shared | +| CPU Cache | ⚠️ Partial | L3 cache shared across cores | +| Network | ⚠️ Partial | Docker bridge, but shared bandwidth | +| Disk I/O | ❌ None | Shared unless using separate volumes | + +**Potential Issues**: +- **Noisy neighbor**: One job could impact another via shared resources +- **VRAM exhaustion**: If host processes use GPU memory +- **Timing variability**: Benchmark results may vary due to shared resources + +**Mitigations**: +- Run one agent per GPU (current approach) +- Use dedicated benchmark nodes for competition scoring +- Monitor for outlier results + +### 4. Artifact Handling + +**Current Flow**: +1. Job writes `result.json` to working directory +2. Buildkite agent uploads to S3 +3. Backend downloads via Buildkite API (302 redirect to S3) + +**Limitations**: +- **Size limit**: ~100MB per artifact (Buildkite limit) +- **Retention**: 6 months by default +- **Download latency**: 1-2s for small files, more for large profiles + +### 5. 
Queue Management + +**Current Model**: One queue per GPU type (e.g., `b200`, `h100`, `mi300`) + +**Limitations**: +- No priority queuing (FIFO only) +- No job preemption +- No fair-share scheduling between users +- Queue depth visibility requires API calls + +**Potential Improvements**: +- Implement priority via build metadata +- Add rate limiting per user +- Create admin queue for verification runs + +### 6. Error Handling + +**Automatic Retries**: +```yaml +retry: + automatic: + - exit_status: -1 # Infrastructure failure + limit: 2 + - exit_status: 255 # Agent disconnect + limit: 1 +``` + +**Not Automatically Retried**: +- Compilation errors (user code issue) +- Test failures (user code issue) +- Timeout (15 min default) +- OOM errors + +### 7. Security Considerations + +**Sandboxing**: +- Jobs run in Docker containers +- No host network access +- Limited volume mounts + +**Risks**: +- User code has full GPU access (could mine crypto briefly) +- User code could attempt network attacks (mitigated by Docker networking) +- Large submissions could exhaust disk space + +**Mitigations**: +- Timeout limits (15 min) +- Disk quotas (via Docker) +- Network isolation (Docker bridge) +- Result validation before storing + +### 8. Organization & Billing + +**Current State**: Running under personal `mark-saroufim` Buildkite org. + +**Limitations**: +- **Not production-ready**: Personal org has limited visibility/access controls +- **Billing unclear**: Need to understand Buildkite pricing for self-hosted agents + - Self-hosted agents are free, but there may be limits on concurrent builds + - Artifact storage (S3) costs depend on volume +- **Access management**: Personal org doesn't support team-based permissions + +**TODO before production**: +- [ ] Create official `gpu-mode` Buildkite organization +- [ ] Understand billing model for high-volume usage +- [ ] Set up proper team access controls +- [ ] Configure SSO/SAML if needed +- [ ] Review artifact retention policies and costs + +--- + +## Future Improvements + +- [ ] Add MIG (Multi-Instance GPU) support for H100/A100 +- [ ] Implement job priority queuing +- [ ] Add per-user rate limiting +- [ ] Support multi-GPU jobs for distributed problems +- [ ] Add warm pool of pre-started containers +- [ ] Implement result caching for identical submissions + diff --git a/deployment/buildkite/Dockerfile b/deployment/buildkite/Dockerfile new file mode 100644 index 00000000..1a31aec2 --- /dev/null +++ b/deployment/buildkite/Dockerfile @@ -0,0 +1,42 @@ +# Kernelbot evaluation image +# Pre-built with all dependencies for fast cold starts +FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 + +# System packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + ca-certificates \ + git \ + build-essential \ + ninja-build \ + cmake \ + && rm -rf /var/lib/apt/lists/* + +# Install uv +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +# Clone and install kernelbot +WORKDIR /opt/kernelbot +RUN git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mode/kernelbot.git . 
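+# NOTE: the branch is baked into the image at build time; rebuild (see build-image.sh)
+# to pick up new kernelbot code or dependency changes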
+ +# Install dependencies with uv +RUN uv sync + +# Install PyTorch and GPU packages +RUN uv pip install torch triton numpy --index-url https://download.pytorch.org/whl/cu124 + +# Ensure venv is activated for any commands +ENV VIRTUAL_ENV=/opt/kernelbot/.venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Verify installation +RUN python -c "import torch; print(f'PyTorch {torch.__version__}')" && \ + python -c "import triton; print(f'Triton installed')" && \ + python -c "from libkernelbot.run_eval import run_config; print('kernelbot installed')" + +# Default command +CMD ["python", "/opt/kernelbot/src/runners/buildkite-runner.py"] diff --git a/deployment/buildkite/build-image.sh b/deployment/buildkite/build-image.sh new file mode 100755 index 00000000..af718450 --- /dev/null +++ b/deployment/buildkite/build-image.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Build the kernelbot Docker image locally on a GPU node +# Usage: ./build-image.sh [--push] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +IMAGE_NAME="${KERNELBOT_IMAGE:-kernelbot:latest}" +BRANCH="${KERNELBOT_BRANCH:-buildkite-infrastructure}" + +echo "=== Building Kernelbot Image ===" +echo "Image: $IMAGE_NAME" +echo "Branch: $BRANCH" +echo "" + +# Update Dockerfile to use correct branch +sed -i "s|--branch [a-zA-Z0-9_-]*|--branch $BRANCH|g" "$SCRIPT_DIR/Dockerfile" 2>/dev/null || \ + sed -i '' "s|--branch [a-zA-Z0-9_-]*|--branch $BRANCH|g" "$SCRIPT_DIR/Dockerfile" + +echo "Building image..." +docker build -t "$IMAGE_NAME" -f "$SCRIPT_DIR/Dockerfile" "$REPO_ROOT" + +echo "" +echo "=== Build Complete ===" +echo "Image: $IMAGE_NAME" +docker images "$IMAGE_NAME" + +# Optional: push to registry +if [[ "${1:-}" == "--push" ]]; then + REGISTRY="${KERNELBOT_REGISTRY:-ghcr.io/gpu-mode}" + REMOTE_IMAGE="$REGISTRY/kernelbot:latest" + echo "" + echo "Pushing to $REMOTE_IMAGE..." 
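+ # Retag under the registry namespace and push; assumes you are already logged in
+ # to the registry (e.g. docker login ghcr.io)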
+ docker tag "$IMAGE_NAME" "$REMOTE_IMAGE" + docker push "$REMOTE_IMAGE" + echo "Pushed: $REMOTE_IMAGE" +fi + +echo "" +echo "To use this image, update your pipeline config:" +echo " image: \"$IMAGE_NAME\"" diff --git a/deployment/buildkite/pipeline-artifact-test.yml b/deployment/buildkite/pipeline-artifact-test.yml new file mode 100644 index 00000000..d7c37aba --- /dev/null +++ b/deployment/buildkite/pipeline-artifact-test.yml @@ -0,0 +1,69 @@ +# Simple artifact test pipeline +# Tests: submit job -> run in Docker -> write result.json -> upload artifact -> download + +steps: + - label: ":package: Artifact Test" + agents: + queue: "${KERNELBOT_QUEUE:-test}" + + plugins: + - docker#v5.11.0: + image: "python:3.11-slim" + propagate-environment: true + environment: + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + - NVIDIA_VISIBLE_DEVICES + + command: | + python3 << 'PYEOF' + import base64 + import json + import os + import zlib + from datetime import datetime + + run_id = os.environ.get("KERNELBOT_RUN_ID", "unknown") + payload_b64 = os.environ.get("KERNELBOT_PAYLOAD", "") + + print("=== Artifact Test ===") + print(f"Run ID: {run_id}") + print(f"GPU: {os.environ.get('NVIDIA_VISIBLE_DEVICES', 'not set')}") + + # Decode payload if present + config = {} + if payload_b64: + try: + compressed = base64.b64decode(payload_b64) + config_json = zlib.decompress(compressed).decode("utf-8") + config = json.loads(config_json) + print(f"Decoded config: {json.dumps(config, indent=2)}") + except Exception as e: + print(f"Could not decode payload: {e}") + + # Create result + result = { + "success": True, + "error": None, + "run_id": run_id, + "timestamp": datetime.now().isoformat(), + "config_received": config, + "system": { + "gpu": os.environ.get("NVIDIA_VISIBLE_DEVICES", "none"), + }, + "runs": {} + } + + # Write result.json + with open("result.json", "w") as f: + json.dump(result, f, indent=2) + + print("\n=== Result ===") + print(json.dumps(result, indent=2)) + print("\nResult written to result.json") + PYEOF + + artifact_paths: + - "result.json" + + timeout_in_minutes: 5 diff --git a/deployment/buildkite/pipeline-eval.yml b/deployment/buildkite/pipeline-eval.yml new file mode 100644 index 00000000..8bfa573b --- /dev/null +++ b/deployment/buildkite/pipeline-eval.yml @@ -0,0 +1,72 @@ +# Kernelbot Evaluation Pipeline for Buildkite +# Mirrors GitHub runner: clone repo, install deps, run evaluation + +steps: + - label: ":rocket: Kernel Evaluation" + agents: + queue: "${KERNELBOT_QUEUE:-test}" + + plugins: + - docker#v5.11.0: + image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + always-pull: false + gpus: "all" + propagate-environment: true + shell: ["/bin/bash", "-e", "-c"] + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + workdir: /workdir + + command: | + set -e + + echo "=== Environment ===" + echo "NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$KERNELBOT_RUN_ID" + nvidia-smi -L + + echo "" + echo "=== Installing System Dependencies ===" + apt-get update -qq + apt-get install -y -qq curl ca-certificates git + + echo "" + echo "=== Installing uv ===" + curl -LsSf https://astral.sh/uv/install.sh | sh + . 
/root/.local/bin/env + + echo "" + echo "=== Cloning Repository ===" + git clone --depth 1 --branch buildkite-infrastructure https://github.com/gpu-mode/kernelbot.git /opt/kernelbot + cd /opt/kernelbot + + echo "" + echo "=== Installing Dependencies ===" + uv sync + + echo "" + echo "=== Installing PyTorch ===" + uv pip install torch triton numpy --index-url https://download.pytorch.org/whl/cu124 + + echo "" + echo "=== Running Evaluation ===" + . .venv/bin/activate + python src/runners/buildkite-runner.py + + echo "" + echo "=== Copying Artifacts ===" + cp result.json /workdir/result.json + cp -r profile_data /workdir/profile_data 2>/dev/null || true + + echo "=== Done ===" + + artifact_paths: + - "result.json" + - "profile_data/*" + + timeout_in_minutes: 30 diff --git a/deployment/buildkite/pipeline-fast.yml b/deployment/buildkite/pipeline-fast.yml new file mode 100644 index 00000000..4d50a3fb --- /dev/null +++ b/deployment/buildkite/pipeline-fast.yml @@ -0,0 +1,53 @@ +# Kernelbot Fast Evaluation Pipeline +# Uses pre-built image for fast cold starts (~5s vs ~40s) +# +# Prerequisites: +# 1. Build image on node: ./deployment/buildkite/build-image.sh +# 2. Or pull from registry: docker pull ghcr.io/gpu-mode/kernelbot:latest + +steps: + - label: ":rocket: Kernel Evaluation" + agents: + queue: "${KERNELBOT_QUEUE:-test}" + + plugins: + - docker#v5.11.0: + image: "${KERNELBOT_IMAGE:-kernelbot:latest}" + always-pull: false + gpus: "all" + propagate-environment: true + shell: ["/bin/bash", "-e", "-c"] + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + workdir: /workdir + + command: | + set -e + + echo "=== Environment ===" + echo "NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$KERNELBOT_RUN_ID" + nvidia-smi -L + + echo "" + echo "=== Running Evaluation ===" + cd /opt/kernelbot + python src/runners/buildkite-runner.py + + echo "" + echo "=== Copying Artifacts ===" + cp result.json /workdir/result.json + cp -r profile_data /workdir/profile_data 2>/dev/null || true + + echo "=== Done ===" + + artifact_paths: + - "result.json" + - "profile_data/*" + + timeout_in_minutes: 15 diff --git a/deployment/buildkite/pipeline-test-docker.yml b/deployment/buildkite/pipeline-test-docker.yml new file mode 100644 index 00000000..bba1c955 --- /dev/null +++ b/deployment/buildkite/pipeline-test-docker.yml @@ -0,0 +1,59 @@ +# Simple Docker test pipeline for Buildkite +# Paste this into your pipeline settings to test Docker + GPU isolation + artifacts + +steps: + - label: ":whale: Docker GPU Test" + agents: + queue: "test" # Change to your queue name + + plugins: + - docker#v5.11.0: + image: "nvidia/cuda:12.4.0-runtime-ubuntu22.04" + always-pull: true + gpus: "all" + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + # Resource constraints from environment hook + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + + command: | + echo "=== Environment ===" + echo "NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES" + echo "CUDA_VISIBLE_DEVICES=$$CUDA_VISIBLE_DEVICES" + echo "KERNELBOT_RUN_ID=$$KERNELBOT_RUN_ID" + echo "" + echo "=== GPU Info ===" + nvidia-smi + echo "" + echo "=== CPU Info ===" + nproc + echo "" + echo "=== Memory Info ===" + free -h + echo "" + echo "=== Creating result.json ===" + cat > result.json << JSONEOF + { + "success": true, + "error": "", + 
"runs": {}, + "system": { + "gpu_name": "$$NVIDIA_VISIBLE_DEVICES", + "cuda_version": "12.4", + "python_version": "N/A" + } + } + JSONEOF + cat result.json + echo "" + echo "=== Done ===" + + artifact_paths: + - "result.json" + + timeout_in_minutes: 5 diff --git a/deployment/buildkite/pipeline.yml b/deployment/buildkite/pipeline.yml new file mode 100644 index 00000000..9939658d --- /dev/null +++ b/deployment/buildkite/pipeline.yml @@ -0,0 +1,52 @@ +# Kernelbot Evaluation Pipeline +# Jobs target specific GPU queue, Buildkite routes to idle agent +# +# NOTE: This pipeline is designed to be triggered via API with KERNELBOT_PAYLOAD. +# Direct push/PR triggers will skip gracefully. + +steps: + - label: ":rocket: Kernel Evaluation" + command: "python /app/src/runners/buildkite-runner.py" + + # Queue is set dynamically via KERNELBOT_QUEUE env var + agents: + queue: "${KERNELBOT_QUEUE}" + + plugins: + - docker#v5.11.0: + image: "${KERNELBOT_IMAGE:-ghcr.io/gpu-mode/kernelbot:latest}" + always-pull: true + gpus: "all" # Use gpus instead of runtime: nvidia for reliability + # GPU isolation - agent exports NVIDIA_VISIBLE_DEVICES + propagate-environment: true + environment: + - NVIDIA_VISIBLE_DEVICES + - CUDA_VISIBLE_DEVICES + - KERNELBOT_PAYLOAD + - KERNELBOT_RUN_ID + - KERNELBOT_GPU_INDEX + - KERNELBOT_CPUSET + - KERNELBOT_MEMORY + # Resource constraints + cpus: "${KERNELBOT_CPUS:-8}" + memory: "${KERNELBOT_MEMORY:-64g}" + # Mount for caching + volumes: + - "/var/lib/buildkite-agent/cache:/cache:rw" + # Cleanup + leave-container: false + + timeout_in_minutes: 15 + + # Artifacts + artifact_paths: + - "result.json" + - "profile_data/**/*" + + # Retry on infrastructure failures only + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: 255 + limit: 1 diff --git a/deployment/buildkite/setup-node-simple.sh b/deployment/buildkite/setup-node-simple.sh new file mode 100755 index 00000000..351a8a04 --- /dev/null +++ b/deployment/buildkite/setup-node-simple.sh @@ -0,0 +1,229 @@ +#!/bin/bash +# Buildkite GPU Node Setup +# Usage: sudo BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=test ./setup-node-simple.sh +# +# PREREQUISITES: +# 1. Create a Buildkite account and pipeline named 'kernelbot' +# 2. Generate an Agent Token from: Agents > Agent Tokens +# 3. Create a queue in: Agents > Default cluster > Queues > New Queue +# - Enter your GPU_TYPE as the key (e.g., 'test', 'b200', 'h100') +# - Select 'Self hosted' +# 4. Run this script with the token and GPU type + +set -euo pipefail + +# === CONFIGURATION === +BUILDKITE_TOKEN="${BUILDKITE_AGENT_TOKEN:?Must set BUILDKITE_AGENT_TOKEN}" +GPU_TYPE="${GPU_TYPE:?Must set GPU_TYPE (e.g., b200, mi300, h100, test)}" +NODE_NAME="${NODE_NAME:-$(hostname)}" + +# Auto-detect GPU count +if command -v nvidia-smi &> /dev/null; then + GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1) +else + GPU_COUNT="${GPU_COUNT:-1}" +fi + +echo "=== Buildkite GPU Node Setup ===" +echo "Node: ${NODE_NAME}" +echo "GPU Type: ${GPU_TYPE}" +echo "GPU Count: ${GPU_COUNT}" +echo "" + +# === CHECK ROOT === +if [[ $EUID -ne 0 ]]; then + echo "ERROR: This script must be run as root (use sudo)" + exit 1 +fi + +# === INSTALL BUILDKITE AGENT === +if ! command -v buildkite-agent &> /dev/null; then + echo "Installing Buildkite Agent..." 
+ apt-get update + apt-get install -y apt-transport-https gnupg + curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | \ + gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | \ + tee /etc/apt/sources.list.d/buildkite-agent.list + apt-get update + apt-get install -y buildkite-agent + echo "Buildkite Agent installed." +else + echo "Buildkite Agent already installed." +fi + +# === STOP EXISTING AGENTS === +echo "Stopping existing agents..." +systemctl stop buildkite-agent 2>/dev/null || true +systemctl disable buildkite-agent 2>/dev/null || true +for i in $(seq 0 15); do + systemctl stop "buildkite-agent-gpu${i}" 2>/dev/null || true + systemctl disable "buildkite-agent-gpu${i}" 2>/dev/null || true +done + +# === CREATE DIRECTORIES === +echo "Creating directories..." +mkdir -p /var/lib/buildkite-agent/builds +mkdir -p /var/lib/buildkite-agent/plugins +mkdir -p /etc/buildkite-agent/hooks +chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent +chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent + +# === CONFIGURE GIT TO USE HTTPS (avoids SSH key issues) === +echo "Configuring git to use HTTPS..." +cd /tmp +sudo -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:" + +# === CREATE ENVIRONMENT HOOK FOR GPU ISOLATION === +echo "Creating environment hook for GPU/CPU/RAM isolation..." +cat > /etc/buildkite-agent/hooks/environment << 'HOOKEOF' +#!/bin/bash +# Resource isolation hook - auto-detects and divides resources by GPU count + +GPU_INDEX="${BUILDKITE_AGENT_META_DATA_GPU_INDEX:-0}" + +# Auto-detect total resources +TOTAL_CPUS=$(nproc) +TOTAL_RAM_KB=$(grep MemTotal /proc/meminfo | awk '{print $2}') +TOTAL_RAM_GB=$((TOTAL_RAM_KB / 1024 / 1024)) + +# Auto-detect GPU count +if command -v nvidia-smi &> /dev/null; then + GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1) +else + GPU_COUNT=1 +fi + +# Calculate per-GPU allocation +CPUS_PER_GPU=$((TOTAL_CPUS / GPU_COUNT)) +RAM_PER_GPU=$((TOTAL_RAM_GB / GPU_COUNT)) + +# GPU isolation +export NVIDIA_VISIBLE_DEVICES="${GPU_INDEX}" +export CUDA_VISIBLE_DEVICES="${GPU_INDEX}" + +# CPU isolation (assign a range of CPUs to each GPU) +CPU_START=$((GPU_INDEX * CPUS_PER_GPU)) +CPU_END=$((CPU_START + CPUS_PER_GPU - 1)) +export KERNELBOT_CPUSET="${CPU_START}-${CPU_END}" +export KERNELBOT_CPUS="${CPUS_PER_GPU}" + +# Memory isolation +export KERNELBOT_MEMORY="${RAM_PER_GPU}g" + +# GPU index for the runner +export KERNELBOT_GPU_INDEX="${GPU_INDEX}" + +echo "=== Resource Isolation ===" +echo "Machine: ${TOTAL_CPUS} CPUs, ${TOTAL_RAM_GB}GB RAM, ${GPU_COUNT} GPUs" +echo "This job: GPU ${NVIDIA_VISIBLE_DEVICES}, CPUs ${KERNELBOT_CPUSET}, RAM ${KERNELBOT_MEMORY}" +HOOKEOF +chmod +x /etc/buildkite-agent/hooks/environment +chown buildkite-agent:buildkite-agent /etc/buildkite-agent/hooks/environment + +# === CREATE AGENT FOR EACH GPU === +echo "Creating ${GPU_COUNT} agents..." 
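+# One agent per GPU: each gets its own config file, build path, and systemd unit,
+# all tagged with the same queue so Buildkite can route a job to any idle GPU on the node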
+ +for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + agent_name="${NODE_NAME}-gpu${gpu_idx}" + config_file="/etc/buildkite-agent/buildkite-agent-gpu${gpu_idx}.cfg" + build_dir="/var/lib/buildkite-agent/builds/gpu${gpu_idx}" + + mkdir -p "${build_dir}" + chown buildkite-agent:buildkite-agent "${build_dir}" + + # Write config + cat > "${config_file}" << EOF +token="${BUILDKITE_TOKEN}" +name="${agent_name}" +tags="queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}" +build-path="${build_dir}" +hooks-path="/etc/buildkite-agent/hooks" +plugins-path="/var/lib/buildkite-agent/plugins" +EOF + chown buildkite-agent:buildkite-agent "${config_file}" + + # Write systemd service + cat > "/etc/systemd/system/buildkite-agent-gpu${gpu_idx}.service" << EOF +[Unit] +Description=Buildkite Agent (GPU ${gpu_idx}) +Documentation=https://buildkite.com/docs/agent/v3 +After=network.target + +[Service] +Type=simple +User=buildkite-agent +ExecStart=/usr/bin/buildkite-agent start --config ${config_file} +RestartSec=5 +Restart=on-failure +TimeoutStartSec=10 +TimeoutStopSec=60 + +[Install] +WantedBy=multi-user.target +EOF + + echo " Created agent ${gpu_idx}: ${agent_name}" +done + +# === START AGENTS === +echo "Starting agents..." +systemctl daemon-reload + +for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + systemctl enable "buildkite-agent-gpu${gpu_idx}" + systemctl start "buildkite-agent-gpu${gpu_idx}" +done + +sleep 3 + +echo "" +echo "=== Agent Status ===" +for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + status=$(systemctl is-active "buildkite-agent-gpu${gpu_idx}" 2>/dev/null || echo "unknown") + echo " GPU ${gpu_idx}: ${status}" +done + +echo "" +echo "=== Setup Complete ===" +echo "" +echo "Created ${GPU_COUNT} agents for queue: ${GPU_TYPE}" +echo "GPU isolation is handled via environment hook (NVIDIA_VISIBLE_DEVICES)" +echo "" +echo "IMPORTANT: Make sure you created the '${GPU_TYPE}' queue in Buildkite:" +echo " 1. Go to: https://buildkite.com/organizations/YOUR_ORG/clusters" +echo " 2. Click 'Default cluster' > 'Queues' > 'New Queue'" +echo " 3. Enter '${GPU_TYPE}' as the key, select 'Self hosted'" +echo "" +echo "Your agents should appear at: https://buildkite.com/organizations/YOUR_ORG/agents" +echo "" +echo "Test with this pipeline step:" +echo ' steps:' +echo ' - label: "GPU Test"' +echo ' command: "echo NVIDIA_VISIBLE_DEVICES=$$NVIDIA_VISIBLE_DEVICES && nvidia-smi -L"' +echo ' agents:' +echo " queue: \"${GPU_TYPE}\"" + +# === BUILD DOCKER IMAGE (optional) === +if [[ "${BUILD_IMAGE:-}" == "true" ]]; then + echo "" + echo "=== Building Docker Image ===" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + + if [[ -f "$SCRIPT_DIR/Dockerfile" ]]; then + docker build -t kernelbot:latest -f "$SCRIPT_DIR/Dockerfile" "$SCRIPT_DIR/../.." 
+ echo "Docker image built: kernelbot:latest" + echo "" + echo "To use the fast pipeline, update Buildkite config to use:" + echo " image: \"kernelbot:latest\"" + else + echo "WARNING: Dockerfile not found at $SCRIPT_DIR/Dockerfile" + echo "Clone the repo first: git clone https://github.com/gpu-mode/kernelbot.git" + fi +fi + +echo "" +echo "For faster cold starts, build the Docker image:" +echo " BUILD_IMAGE=true $0" +echo "Or manually:" +echo " ./deployment/buildkite/build-image.sh" diff --git a/deployment/buildkite/setup-node.sh b/deployment/buildkite/setup-node.sh new file mode 100755 index 00000000..70186499 --- /dev/null +++ b/deployment/buildkite/setup-node.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# Buildkite GPU Node Setup +# Usage: BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=b200 ./setup-node.sh + +set -euo pipefail + +# === CONFIGURATION === +BUILDKITE_TOKEN="${BUILDKITE_AGENT_TOKEN:?Must set BUILDKITE_AGENT_TOKEN}" +GPU_TYPE="${GPU_TYPE:?Must set GPU_TYPE (e.g., b200, mi300, h100)}" +NODE_NAME="${NODE_NAME:-$(hostname)}" + +# Auto-detect GPU count +detect_gpu_count() { + if command -v nvidia-smi &> /dev/null; then + nvidia-smi --query-gpu=count --format=csv,noheader | head -1 + elif command -v rocm-smi &> /dev/null; then + rocm-smi --showid | grep -c "GPU" + else + echo "8" # Default + fi +} + +GPU_COUNT="${GPU_COUNT:-$(detect_gpu_count)}" +CPUS_PER_GPU="${CPUS_PER_GPU:-8}" +RAM_PER_GPU="${RAM_PER_GPU:-64g}" + +# Queue name - same for all agents on this node +QUEUE_NAME="${GPU_TYPE}" + +echo "=== Buildkite GPU Node Setup ===" +echo "Node: ${NODE_NAME}" +echo "GPU Type: ${GPU_TYPE}" +echo "GPU Count: ${GPU_COUNT}" +echo "Queue: ${QUEUE_NAME}" +echo "CPUs per GPU: ${CPUS_PER_GPU}" +echo "RAM per GPU: ${RAM_PER_GPU}" +echo "" + +# === INSTALL DEPENDENCIES === + +install_docker_nvidia() { + echo "Installing Docker and NVIDIA Container Toolkit..." + + # Docker + if ! command -v docker &> /dev/null; then + curl -fsSL https://get.docker.com | sh + usermod -aG docker ubuntu 2>/dev/null || true + fi + + # NVIDIA Container Toolkit + if ! dpkg -l | grep -q nvidia-container-toolkit; then + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + apt-get update + apt-get install -y nvidia-container-toolkit + nvidia-ctk runtime configure --runtime=docker + systemctl restart docker + fi + + echo "Docker + NVIDIA toolkit installed." +} + +install_buildkite_agent() { + echo "Installing Buildkite Agent..." + + if ! command -v buildkite-agent &> /dev/null; then + apt-get install -y apt-transport-https gnupg + curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | \ + gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | \ + tee /etc/apt/sources.list.d/buildkite-agent.list + apt-get update + apt-get install -y buildkite-agent + fi + + echo "Buildkite Agent installed." +} + +# === CREATE PER-GPU AGENTS === + +setup_agents() { + echo "Configuring ${GPU_COUNT} agents..." 
+ + # Create base directories + mkdir -p /etc/buildkite-agent/hooks + mkdir -p /var/lib/buildkite-agent + + # Create shared hooks + cat > /etc/buildkite-agent/hooks/environment << 'HOOKEOF' +#!/bin/bash +# GPU isolation hook - runs before each job +set -euo pipefail + +# GPU index is set per-agent via environment +echo "GPU ${BUILDKITE_AGENT_META_DATA_GPU_INDEX} allocated for this job" +echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}" +HOOKEOF + chmod +x /etc/buildkite-agent/hooks/environment + + # Create pre-exit hook for cleanup + cat > /etc/buildkite-agent/hooks/pre-exit << 'HOOKEOF' +#!/bin/bash +# Cleanup after job +docker system prune -f --filter "until=1h" 2>/dev/null || true +HOOKEOF + chmod +x /etc/buildkite-agent/hooks/pre-exit + + # Stop any existing agents + systemctl stop 'buildkite-agent-gpu*' 2>/dev/null || true + + # Create agent for each GPU + for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + local cpu_start=$((gpu_idx * CPUS_PER_GPU)) + local cpu_end=$((cpu_start + CPUS_PER_GPU - 1)) + local agent_name="${NODE_NAME}-gpu${gpu_idx}" + local config_dir="/etc/buildkite-agent/agent-${gpu_idx}" + local build_dir="/var/lib/buildkite-agent/gpu-${gpu_idx}/builds" + + mkdir -p "${config_dir}" + mkdir -p "${build_dir}" + + # Agent configuration + cat > "${config_dir}/buildkite-agent.cfg" << CFGEOF +# Buildkite Agent Configuration - GPU ${gpu_idx} +token="${BUILDKITE_TOKEN}" +name="${agent_name}" +tags="queue=${QUEUE_NAME},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}" +build-path="${build_dir}" +hooks-path="/etc/buildkite-agent/hooks" +plugins-path="/var/lib/buildkite-agent/plugins" +disconnect-after-job=false +disconnect-after-idle-timeout=0 +CFGEOF + + # Agent environment file (for GPU isolation) + cat > "${config_dir}/environment" << ENVEOF +NVIDIA_VISIBLE_DEVICES=${gpu_idx} +CUDA_VISIBLE_DEVICES=${gpu_idx} +KERNELBOT_GPU_INDEX=${gpu_idx} +KERNELBOT_CPU_START=${cpu_start} +KERNELBOT_CPU_END=${cpu_end} +KERNELBOT_CPUSET=${cpu_start}-${cpu_end} +KERNELBOT_MEMORY=${RAM_PER_GPU} +ENVEOF + + # Systemd service + cat > "/etc/systemd/system/buildkite-agent-gpu${gpu_idx}.service" << SVCEOF +[Unit] +Description=Buildkite Agent (GPU ${gpu_idx}) +Documentation=https://buildkite.com/docs/agent/v3 +After=network.target docker.service +Requires=docker.service + +[Service] +Type=simple +User=buildkite-agent +EnvironmentFile=${config_dir}/environment +ExecStart=/usr/bin/buildkite-agent start --config ${config_dir}/buildkite-agent.cfg +RestartSec=5 +Restart=on-failure +RestartForceExitStatus=SIGPIPE +TimeoutStartSec=10 +TimeoutStopSec=60 +KillMode=process + +[Install] +WantedBy=multi-user.target +SVCEOF + + echo " Agent ${gpu_idx}: GPU=${gpu_idx}, CPUs=${cpu_start}-${cpu_end}" + done + + # Fix permissions + chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent + chown -R buildkite-agent:buildkite-agent /etc/buildkite-agent + + # Add buildkite-agent to docker group + usermod -aG docker buildkite-agent +} + +# === START AGENTS === + +start_agents() { + echo "Starting agents..." 
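+ # Reload systemd so the new per-GPU units are visible, then enable and start each one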
+ systemctl daemon-reload + + for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + systemctl enable "buildkite-agent-gpu${gpu_idx}" + systemctl start "buildkite-agent-gpu${gpu_idx}" + done + + sleep 3 + + echo "" + echo "=== Agent Status ===" + for gpu_idx in $(seq 0 $((GPU_COUNT - 1))); do + status=$(systemctl is-active "buildkite-agent-gpu${gpu_idx}" 2>/dev/null || echo "unknown") + echo " GPU ${gpu_idx}: ${status}" + done +} + +# === MAIN === + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + exit 1 +fi + +install_docker_nvidia +install_buildkite_agent +setup_agents +start_agents + +echo "" +echo "=== Setup Complete ===" +echo "Agents should appear at: https://buildkite.com/organizations/YOUR_ORG/agents" +echo "Queue: ${QUEUE_NAME}" +echo "" +echo "Test with: buildkite-agent start --help" diff --git a/scripts/e2e_buildkite_with_db.py b/scripts/e2e_buildkite_with_db.py new file mode 100644 index 00000000..d7ff3a09 --- /dev/null +++ b/scripts/e2e_buildkite_with_db.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +"""End-to-end test for Buildkite integration with database storage. + +This script: +1. Creates a test leaderboard in the local database +2. Submits a real kernel evaluation job to Buildkite +3. Stores results in the PostgreSQL database +4. Verifies everything is stored correctly + +Usage: + BUILDKITE_API_TOKEN=xxx uv run python scripts/e2e_buildkite_with_db.py + +Options: + --queue Buildkite queue (default: test) + --org Buildkite org (default: mark-saroufim) + --pipeline Pipeline name (default: kernelbot) + --example Example to run (default: vectoradd_py) + --cleanup Delete the test leaderboard after the test + --dry-run Print config without submitting +""" + +import argparse +import asyncio +import datetime +import os +import sys +from pathlib import Path + +# Add src to path for local testing +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +class SimpleReporter: + """Simple progress reporter for CLI output.""" + + def __init__(self, title: str = ""): + self.title = title + self.messages = [] + + async def push(self, msg): + self.messages.append(msg) + print(f" [PUSH] {msg}") + + async def update(self, msg): + print(f" [UPDATE] {msg}") + + async def update_title(self, title): + self.title = title + print(f" [TITLE] {title}") + + async def display_report(self, title, report): + print(f"\n [REPORT] {title}") + for line in report: + print(f" {line}") + + +class MultiReporter: + """Multi-run progress reporter.""" + + def __init__(self): + self.runs = [] + + def add_run(self, name: str) -> SimpleReporter: + reporter = SimpleReporter(name) + self.runs.append(reporter) + print(f"\n--- Run: {name} ---") + return reporter + + async def show(self, msg): + print(f"\n[SHOW] {msg}") + + +async def main(): # noqa: C901 + parser = argparse.ArgumentParser(description="E2E Buildkite test with database storage") + parser.add_argument("--queue", default="test", help="Buildkite queue (default: test)") + parser.add_argument("--org", default="mark-saroufim", help="Buildkite org slug") + parser.add_argument("--pipeline", default="kernelbot", help="Pipeline slug") + parser.add_argument("--example", default="vectoradd_py", help="Example to run") + parser.add_argument("--mode", choices=["test", "leaderboard"], default="test", help="Submission mode") + parser.add_argument("--cleanup", action="store_true", help="Delete test leaderboard after test") + parser.add_argument("--dry-run", action="store_true", help="Print config without submitting") + args = 
parser.parse_args() + + # Check for required environment variables + token = os.environ.get("BUILDKITE_API_TOKEN") + if not token: + print("ERROR: BUILDKITE_API_TOKEN environment variable not set") + print("\nTo get a token:") + print(" 1. Go to https://buildkite.com/user/api-access-tokens") + print(" 2. Create token with: read_builds, write_builds, read_artifacts, read_agents") + sys.exit(1) + + database_url = os.environ.get("DATABASE_URL", "postgresql://marksaroufim@localhost:5432/kernelbot") + disable_ssl = os.environ.get("DISABLE_SSL", "true") + + print("=" * 60) + print("Buildkite E2E Test with Database Storage") + print("=" * 60) + print(f"Organization: {args.org}") + print(f"Pipeline: {args.pipeline}") + print(f"Queue: {args.queue}") + print(f"Example: {args.example}") + print(f"Mode: {args.mode}") + print(f"Database: {database_url}") + print() + + # Import kernelbot modules + from libkernelbot.consts import BuildkiteGPU, SubmissionMode + from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher + from libkernelbot.leaderboard_db import LeaderboardDB + from libkernelbot.task import make_task_definition + + # Set up database connection + db = LeaderboardDB(url=database_url, ssl_mode="disable" if disable_ssl else "require") + + # Find example + project_root = Path(__file__).parent.parent + task_path = project_root / "examples" / args.example + + if not task_path.exists(): + print(f"ERROR: Example '{args.example}' not found at {task_path}") + print("Available examples:") + for p in (project_root / "examples").iterdir(): + if p.is_dir() and (p / "task.yml").exists(): + print(f" - {p.name}") + sys.exit(1) + + # Load task definition + task_definition = make_task_definition(task_path) + leaderboard_name = f"e2e-test-{args.example}" + + # Find submission file + for name in ["submission_triton.py", "submission.py", "submission_cuda_inline.py"]: + if (task_path / name).exists(): + submission_file = task_path / name + break + else: + print(f"ERROR: No submission file found in {task_path}") + sys.exit(1) + + submission_code = submission_file.read_text() + + print(f"Task: {task_path.name}") + print(f"Submission: {submission_file.name}") + print(f"Leaderboard: {leaderboard_name}") + + if args.dry_run: + print("\n[DRY RUN] Would create leaderboard and submit job") + config_keys = list(task_definition.task.config.keys()) if task_definition.task.config else "None" + print(f" Task config keys: {config_keys}") + return + + # Step 1: Create test leaderboard + print("\n" + "=" * 60) + print("Step 1: Creating test leaderboard") + print("=" * 60) + + with db: + # Check if leaderboard already exists + existing = db.get_leaderboard_names() + if leaderboard_name in existing: + print(f" Leaderboard '{leaderboard_name}' already exists, deleting...") + db.delete_leaderboard(leaderboard_name, force=True) + + # Create leaderboard + deadline = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=30) + lb_id = db.create_leaderboard( + name=leaderboard_name, + deadline=deadline, + definition=task_definition, + creator_id=1, # Test user + forum_id=0, + gpu_types=["L40S_BK"], # Buildkite test queue GPU + ) + print(f" Created leaderboard with ID: {lb_id}") + + # Step 2: Set up backend with Buildkite launcher + print("\n" + "=" * 60) + print("Step 2: Setting up Buildkite launcher") + print("=" * 60) + + launcher = BuildkiteLauncher( + BuildkiteConfig( + org_slug=args.org, + pipeline_slug=args.pipeline, + api_token=token, + ) + ) + + # Check queue status + queue_status = await 
launcher.get_queue_status(args.queue) + print(f" Queue: {queue_status.get('queue')}") + print(f" Total agents: {queue_status.get('total')}") + print(f" Idle agents: {queue_status.get('idle')}") + for agent in queue_status.get("agents", []): + print(f" - {agent['name']}: {agent['state']} (busy={agent['busy']})") + + if queue_status.get("total", 0) == 0: + print("\n WARNING: No agents in queue. Job may wait indefinitely.") + print(" Make sure you have agents running on the Buildkite queue.") + + # Step 3: Create submission and run evaluation + print("\n" + "=" * 60) + print("Step 3: Creating submission and running evaluation") + print("=" * 60) + + with db: + # Create submission entry + submission_id = db.create_submission( + leaderboard=leaderboard_name, + file_name=submission_file.name, + user_id=1, # Test user + code=submission_code, + time=datetime.datetime.now(datetime.timezone.utc), + user_name="e2e-test-user", + ) + print(f" Created submission with ID: {submission_id}") + + # Build task config + from libkernelbot.task import build_task_config + + submission_mode = SubmissionMode.LEADERBOARD if args.mode == "leaderboard" else SubmissionMode.TEST + config = build_task_config( + task=task_definition.task, + submission_content=submission_code, + arch=0, # Will be set by runner + mode=submission_mode, + ) + config["submission_id"] = submission_id + + # Run on Buildkite + print("\n Submitting to Buildkite...") + gpu_type = BuildkiteGPU.L40S_BK + reporter = SimpleReporter(f"Test run on {gpu_type.name}") + + result = await launcher.run_submission(config, gpu_type, reporter) + + print(f"\n Result: success={result.success}") + if result.error: + print(f" Error: {result.error}") + print(f" System: {result.system}") + + # Step 4: Store results in database + print("\n" + "=" * 60) + print("Step 4: Storing results in database") + print("=" * 60) + + if result.success: + with db: + for run_name, run_result in result.runs.items(): + if run_result.run is None: + print(f" Skipping {run_name}: no run result") + continue + + score = None + if run_name == "leaderboard" and run_result.run.passed: + # Compute score for leaderboard runs + from libkernelbot.submission import compute_score + score = compute_score(result, task_definition.task, submission_id) + + db.create_submission_run( + submission=submission_id, + start=run_result.start, + end=run_result.end, + mode=run_name, + runner=gpu_type.name, + score=score, + secret=False, + compilation=run_result.compilation, + result=run_result.run, + system=result.system, + ) + passed = run_result.run.passed + duration = run_result.run.duration + print(f" Stored run: {run_name} (passed={passed}, duration={duration:.2f}s)") + + # Mark submission as done + db.mark_submission_done(submission_id) + print(f"\n Marked submission {submission_id} as done") + + # Step 5: Verify data in database + print("\n" + "=" * 60) + print("Step 5: Verifying data in database") + print("=" * 60) + + with db: + submission = db.get_submission_by_id(submission_id) + if submission: + print(f" Submission ID: {submission['submission_id']}") + print(f" Leaderboard: {submission['leaderboard_name']}") + print(f" File: {submission['file_name']}") + print(f" Done: {submission['done']}") + print(f" Runs: {len(submission['runs'])}") + for run in submission['runs']: + print(f" - {run['mode']}: passed={run['passed']}, runner={run['runner']}") + if run.get('system'): + gpu_name = run['system'].get('gpu', 'unknown') if isinstance(run['system'], dict) else 'unknown' + print(f" GPU: {gpu_name}") + else: + 
print(" ERROR: Could not retrieve submission from database!") + + # Step 6: Show summary + print("\n" + "=" * 60) + print("Summary") + print("=" * 60) + print(f" Leaderboard: {leaderboard_name}") + print(f" Submission ID: {submission_id}") + print(f" Success: {result.success}") + if result.runs: + for name, run in result.runs.items(): + if run.run: + print(f" {name}: passed={run.run.passed}, duration={run.run.duration:.2f}s") + + # Cleanup if requested + if args.cleanup: + print("\n" + "=" * 60) + print("Cleanup") + print("=" * 60) + with db: + db.delete_leaderboard(leaderboard_name, force=True) + print(f" Deleted leaderboard: {leaderboard_name}") + + print("\n" + "=" * 60) + print("E2E Test Complete!") + print("=" * 60) + + sys.exit(0 if result.success else 1) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scripts/submit_buildkite_job.py b/scripts/submit_buildkite_job.py new file mode 100755 index 00000000..8d835657 --- /dev/null +++ b/scripts/submit_buildkite_job.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 +"""Submit a test job to Buildkite and download the result. + +Usage: + # Simple test (just writes dummy result.json): + BUILDKITE_API_TOKEN=xxx python scripts/submit_buildkite_job.py + + # Real evaluation with vectoradd example: + BUILDKITE_API_TOKEN=xxx python scripts/submit_buildkite_job.py --eval vectoradd_py + + # Real evaluation with identity example: + BUILDKITE_API_TOKEN=xxx python scripts/submit_buildkite_job.py --eval identity_py +""" + +import argparse +import asyncio +import json +import os +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + +from libkernelbot.consts import BuildkiteGPU, SubmissionMode +from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher +from libkernelbot.task import build_task_config, make_task_definition + + +class SimpleReporter: + async def push(self, msg): + print(f"[STATUS] {msg}") + + async def update(self, msg): + print(f"[UPDATE] {msg}") + + +async def main(): # noqa: C901 + parser = argparse.ArgumentParser(description="Submit a test job to Buildkite") + parser.add_argument("--org", default="mark-saroufim", help="Buildkite org slug") + parser.add_argument("--pipeline", default="kernelbot", help="Pipeline slug") + parser.add_argument("--queue", default="test", help="Queue name") + parser.add_argument("--run-id", default="manual-test", help="Run ID for this job") + parser.add_argument( + "--eval", + type=str, + default=None, + help="Run real evaluation with example (e.g., 'vectoradd_py', 'identity_py')", + ) + parser.add_argument( + "--submission", + type=str, + default=None, + help="Submission file to use (default: auto-detect)", + ) + args = parser.parse_args() + + token = os.environ.get("BUILDKITE_API_TOKEN") + if not token: + print("ERROR: Set BUILDKITE_API_TOKEN environment variable") + sys.exit(1) + + print("=== Buildkite Job Submission ===") + print(f"Org: {args.org}") + print(f"Pipeline: {args.pipeline}") + print(f"Queue: {args.queue}") + print(f"Run ID: {args.run_id}") + + launcher = BuildkiteLauncher( + BuildkiteConfig( + org_slug=args.org, + pipeline_slug=args.pipeline, + api_token=token, + ) + ) + + if args.eval: + # Real evaluation mode + print(f"Eval: {args.eval}") + print() + + project_root = Path(__file__).parent.parent + task_path = project_root / "examples" / args.eval + + if not task_path.exists(): + print(f"ERROR: Example '{args.eval}' not found at {task_path}") + print("Available examples:") + for p 
in (project_root / "examples").iterdir(): + if p.is_dir() and (p / "task.yml").exists(): + print(f" - {p.name}") + sys.exit(1) + + task_definition = make_task_definition(task_path) + + # Find submission file + if args.submission: + submission_file = task_path / args.submission + else: + # Try common submission names + for name in ["submission_triton.py", "submission.py", "submission_cuda_inline.py"]: + if (task_path / name).exists(): + submission_file = task_path / name + break + else: + print(f"ERROR: No submission file found in {task_path}") + sys.exit(1) + + print(f"Task: {task_path.name}") + print(f"Submission: {submission_file.name}") + + submission_content = submission_file.read_text() + + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=0, + mode=SubmissionMode.TEST, + ) + + gpu_type = BuildkiteGPU.L40S_BK + result = await launcher.run_submission(config, gpu_type, SimpleReporter()) + + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + print(f"System: {result.system}") + if result.runs: + for name, run in result.runs.items(): + print(f"\n{name}:") + print(f" Passed: {run.run.passed if run.run else 'N/A'}") + print(f" Duration: {run.run.duration if run.run else 'N/A'}s") + if run.run and run.run.result: + print(f" Result: {run.run.result}") + + else: + # Simple test mode + print("Mode: Simple test (no evaluation)") + print() + + config = { + "test": True, + "message": "Hello from manual test", + "run_id": args.run_id, + } + + print("Submitting job...") + result = await launcher._launch( + run_id=args.run_id, + config=config, + queue=args.queue, + status=SimpleReporter(), + ) + + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + if result.build_url: + print(f"Build URL: {result.build_url}") + if result.result: + print("Downloaded artifact:") + print(json.dumps(result.result, indent=2)) + else: + print("No artifact downloaded") + + sys.exit(0 if (result.success if hasattr(result, "success") else True) else 1) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/kernelbot/env.py b/src/kernelbot/env.py index 90dd276c..380ed55b 100644 --- a/src/kernelbot/env.py +++ b/src/kernelbot/env.py @@ -33,6 +33,11 @@ env.GITHUB_WORKFLOW_BRANCH = os.getenv("GITHUB_WORKFLOW_BRANCH", get_github_branch_name()) env.PROBLEMS_REPO = os.getenv("PROBLEMS_REPO") +# Buildkite-specific constants +env.BUILDKITE_API_TOKEN = os.getenv("BUILDKITE_API_TOKEN") +env.BUILDKITE_ORG = os.getenv("BUILDKITE_ORG", "mark-saroufim") +env.BUILDKITE_PIPELINE = os.getenv("BUILDKITE_PIPELINE", "kernelbot") + # Directory that will be used for local problem development. 
env.PROBLEM_DEV_DIR = os.getenv("PROBLEM_DEV_DIR", "examples") diff --git a/src/kernelbot/main.py b/src/kernelbot/main.py index 71736ee0..749a1d56 100644 --- a/src/kernelbot/main.py +++ b/src/kernelbot/main.py @@ -16,7 +16,8 @@ from libkernelbot import consts from libkernelbot.backend import KernelBackend from libkernelbot.background_submission_manager import BackgroundSubmissionManager -from libkernelbot.launchers import GitHubLauncher, ModalLauncher +from libkernelbot.launchers import BuildkiteLauncher, GitHubLauncher, ModalLauncher +from libkernelbot.launchers.buildkite import BuildkiteConfig from libkernelbot.utils import setup_logging logger = setup_logging(__name__) @@ -29,6 +30,17 @@ def create_backend(debug_mode: bool = False) -> KernelBackend: backend.register_launcher( GitHubLauncher(env.GITHUB_REPO, env.GITHUB_TOKEN, env.GITHUB_WORKFLOW_BRANCH) ) + + # Register Buildkite launcher if API token is configured + if env.BUILDKITE_API_TOKEN: + buildkite_config = BuildkiteConfig( + org_slug=env.BUILDKITE_ORG, + pipeline_slug=env.BUILDKITE_PIPELINE, + api_token=env.BUILDKITE_API_TOKEN, + ) + backend.register_launcher(BuildkiteLauncher(buildkite_config)) + logger.info("Buildkite launcher registered") + return backend diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..3f52737b 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -33,6 +33,14 @@ class ModalGPU(Enum): L4x4 = "L4x4" +class BuildkiteGPU(Enum): + """GPUs available via Buildkite-managed infrastructure.""" + B200_BK = "B200_BK" + H100_BK = "H100_BK" + MI300_BK = "MI300_BK" + L40S_BK = "L40S_BK" # Test infrastructure + + @dataclasses.dataclass class GPU: name: str @@ -50,7 +58,7 @@ def _make_gpu_lookup(runner_map: dict[str, Type[Enum]]): return lookup -_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU}) +_GPU_LOOKUP = _make_gpu_lookup({"Modal": ModalGPU, "GitHub": GitHubGPU, "Buildkite": BuildkiteGPU}) def get_gpu_by_name(name: str) -> GPU: @@ -121,6 +129,11 @@ class RankCriterion(Enum): "MI300": None, "MI300x8": None, "MI250": None, + # Buildkite-managed GPUs + "B200_BK": "100", + "H100_BK": "90a", + "MI300_BK": None, + "L40S_BK": "89", # Ada Lovelace } diff --git a/src/libkernelbot/launchers/__init__.py b/src/libkernelbot/launchers/__init__.py index df47476f..1a7a8a39 100644 --- a/src/libkernelbot/launchers/__init__.py +++ b/src/libkernelbot/launchers/__init__.py @@ -1,5 +1,6 @@ +from .buildkite import BuildkiteLauncher from .github import GitHubLauncher from .launcher import Launcher from .modal import ModalLauncher -__all__ = [Launcher, GitHubLauncher, ModalLauncher] +__all__ = [Launcher, GitHubLauncher, ModalLauncher, BuildkiteLauncher] diff --git a/src/libkernelbot/launchers/buildkite.py b/src/libkernelbot/launchers/buildkite.py new file mode 100644 index 00000000..f160a2c7 --- /dev/null +++ b/src/libkernelbot/launchers/buildkite.py @@ -0,0 +1,447 @@ +"""Buildkite launcher for kernel evaluation jobs. + +Uses single-queue model where all agents on a node share the same queue. +Buildkite automatically routes jobs to idle agents. 
+""" + +from __future__ import annotations + +import asyncio +import base64 +import datetime +import json +import os +import zlib +from dataclasses import dataclass, field +from typing import Any + +import httpx + +from libkernelbot.consts import GPU, BuildkiteGPU +from libkernelbot.report import RunProgressReporter +from libkernelbot.run_eval import ( + CompileResult, + EvalResult, + FullResult, + ProfileResult, + RunResult, + SystemInfo, +) +from libkernelbot.utils import setup_logging + +from .launcher import Launcher + +logger = setup_logging(__name__) + +BUILDKITE_API = "https://api.buildkite.com/v2" + + +@dataclass +class BuildkiteConfig: + """Buildkite launcher configuration.""" + + org_slug: str = "mark-saroufim" + pipeline_slug: str = "kernelbot" + api_token: str = field(default_factory=lambda: os.environ.get("BUILDKITE_API_TOKEN", "")) + + # Docker image for jobs + image: str = "ghcr.io/gpu-mode/kernelbot:latest" + + # Timeouts + poll_interval_seconds: int = 10 + max_wait_seconds: int = 900 # 15 minutes + + # Resource defaults + cpus: int = 8 + memory: str = "64g" + + +@dataclass +class BuildkiteResult: + """Result from a Buildkite job.""" + + success: bool + error: str | None + result: dict[str, Any] | None + build_url: str | None = None + build_number: int | None = None + + +class BuildkiteLauncher(Launcher): + """Launcher that submits jobs to Buildkite.""" + + def __init__(self, config: BuildkiteConfig | None = None): + super().__init__(name="Buildkite", gpus=BuildkiteGPU) + self.config = config or BuildkiteConfig() + self._client: httpx.AsyncClient | None = None + + async def _get_client(self) -> httpx.AsyncClient: + if self._client is None: + self._client = httpx.AsyncClient( + headers={ + "Authorization": f"Bearer {self.config.api_token}", + "Content-Type": "application/json", + }, + timeout=30.0, + ) + return self._client + + def _encode_payload(self, config: dict[str, Any]) -> str: + """Compress and base64-encode config.""" + json_bytes = json.dumps(config).encode("utf-8") + compressed = zlib.compress(json_bytes) + return base64.b64encode(compressed).decode("ascii") + + def _get_queue_for_gpu(self, gpu_type: GPU) -> str: + """Map GPU type to Buildkite queue name.""" + queue_map = { + "B200_BK": "b200", + "H100_BK": "h100", + "MI300_BK": "mi300", + "L40S_BK": "test", # Test infrastructure + } + return queue_map.get(gpu_type.name, gpu_type.name.lower().replace("_bk", "")) + + async def run_submission( + self, config: dict, gpu_type: GPU, status: RunProgressReporter + ) -> FullResult: + """ + Launch a kernel evaluation job on Buildkite. 
+ + Args: + config: Evaluation configuration dict + gpu_type: Which GPU to run on + status: Progress reporter for status updates + + Returns: + FullResult with success status and results + """ + queue = self._get_queue_for_gpu(gpu_type) + run_id = f"sub-{config.get('submission_id', 'unknown')}-{gpu_type.name}" + + await status.push(f"Submitting to Buildkite queue: {queue}") + logger.info(f"Submitting job {run_id} to Buildkite queue {queue}") + + result = await self._launch( + run_id=run_id, + config=config, + queue=queue, + status=status, + ) + + if not result.success: + return FullResult( + success=False, + error=result.error or "Buildkite job failed", + runs={}, + system=SystemInfo(), + ) + + if result.result is None: + return FullResult( + success=False, + error="No result returned from Buildkite job", + runs={}, + system=SystemInfo(), + ) + + # Parse the result + return self._parse_result(result.result) + + async def _launch( + self, + run_id: str, + config: dict[str, Any], + queue: str, + status: RunProgressReporter, + inline_steps: list[dict[str, Any]] | None = None, + ) -> BuildkiteResult: + """ + Launch a kernel evaluation job. + + Args: + run_id: Unique identifier for this run + config: Evaluation configuration dict + queue: GPU queue name (e.g., "b200", "mi300") + status: Progress reporter + inline_steps: Optional inline pipeline steps (for testing without pipeline config) + + Returns: + BuildkiteResult with success status and results + """ + client = await self._get_client() + payload = self._encode_payload(config) + + # Create build + url = ( + f"{BUILDKITE_API}/organizations/{self.config.org_slug}" + f"/pipelines/{self.config.pipeline_slug}/builds" + ) + + build_data = { + "commit": "HEAD", + "branch": "buildkite-infrastructure", + "message": f"Kernel eval: {run_id}", + "env": { + "KERNELBOT_RUN_ID": run_id, + "KERNELBOT_PAYLOAD": payload, + "KERNELBOT_QUEUE": queue, + "KERNELBOT_IMAGE": self.config.image, + "KERNELBOT_CPUS": str(self.config.cpus), + "KERNELBOT_MEMORY": self.config.memory, + }, + "meta_data": { + "run_id": run_id, + "queue": queue, + }, + } + + # If inline steps provided, use them instead of pipeline from repo + if inline_steps: + build_data["steps"] = inline_steps + + try: + response = await client.post(url, json=build_data) + response.raise_for_status() + build = response.json() + except httpx.HTTPError as e: + logger.error(f"Failed to create build: {e}") + return BuildkiteResult( + success=False, + error=f"Failed to create build: {e}", + result=None, + ) + + build_url = build.get("web_url") + build_number = build.get("number") + logger.info(f"Build created: {build_url}") + await status.update(f"Build created: [{build_number}](<{build_url}>)") + + # Wait for completion + return await self._wait_for_build(build, run_id, status) + + async def _wait_for_build( + self, build: dict, run_id: str, status: RunProgressReporter + ) -> BuildkiteResult: + """Poll until build completes and download artifacts.""" + client = await self._get_client() + build_url = build.get("url") + web_url = build.get("web_url") + start = asyncio.get_event_loop().time() + + while asyncio.get_event_loop().time() - start < self.config.max_wait_seconds: + try: + response = await client.get(build_url) + response.raise_for_status() + build = response.json() + except httpx.HTTPError as e: + logger.warning(f"Error polling build: {e}") + await asyncio.sleep(self.config.poll_interval_seconds) + continue + + state = build.get("state") + elapsed = asyncio.get_event_loop().time() - start + + if state 
== "passed": + await status.update(f"Build completed: [{build.get('number')}](<{web_url}>)") + result = await self._download_result(build) + return BuildkiteResult( + success=True, + error=None, + result=result, + build_url=web_url, + build_number=build.get("number"), + ) + + if state in ("failed", "canceled", "blocked"): + return BuildkiteResult( + success=False, + error=f"Build {state}", + result=None, + build_url=web_url, + build_number=build.get("number"), + ) + + await status.update( + f"⏳ Build [{build.get('number')}](<{web_url}>): {state} ({elapsed:.1f}s)" + ) + await asyncio.sleep(self.config.poll_interval_seconds) + + return BuildkiteResult( + success=False, + error="Build timed out", + result=None, + build_url=web_url, + build_number=build.get("number"), + ) + + async def _download_result(self, build: dict) -> dict[str, Any] | None: + """Download result.json artifact.""" + client = await self._get_client() + + # Get artifacts from first job + jobs = build.get("jobs", []) + if not jobs: + return None + + job = jobs[0] + artifacts_url = job.get("artifacts_url") + if not artifacts_url: + return None + + try: + response = await client.get(artifacts_url) + response.raise_for_status() + artifacts = response.json() + + for artifact in artifacts: + if artifact.get("filename") == "result.json": + download_url = artifact.get("download_url") + # Buildkite returns a 302 redirect to S3 + # We need to follow it without the auth header + result_resp = await client.get(download_url, follow_redirects=False) + if result_resp.status_code == 302: + # Get the redirect URL and fetch without auth + s3_url = result_resp.headers.get("location") + async with httpx.AsyncClient(timeout=30.0) as s3_client: + result_resp = await s3_client.get(s3_url) + result_resp.raise_for_status() + return result_resp.json() + else: + result_resp.raise_for_status() + return result_resp.json() + except Exception as e: + logger.error(f"Failed to download artifacts: {e}") + + return None + + def _parse_result(self, data: dict[str, Any]) -> FullResult: + """Parse result.json into FullResult.""" + runs = {} + + for k, v in data.get("runs", {}).items(): + comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"]) + run_res = None if v.get("run") is None else RunResult(**v["run"]) + profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"]) + + res = EvalResult( + start=datetime.datetime.fromisoformat(v["start"]), + end=datetime.datetime.fromisoformat(v["end"]), + compilation=comp_res, + run=run_res, + profile=profile_res, + ) + runs[k] = res + + system = SystemInfo(**data.get("system", {})) + return FullResult(success=True, error="", runs=runs, system=system) + + async def get_queue_status(self, queue: str) -> dict[str, Any]: + """Get status of agents in a queue.""" + client = await self._get_client() + url = f"{BUILDKITE_API}/organizations/{self.config.org_slug}/agents" + + try: + response = await client.get(url) + response.raise_for_status() + agents = response.json() + except httpx.HTTPError as e: + return {"error": str(e), "agents": []} + + queue_agents = [] + for agent in agents: + agent_queue = None + for meta in agent.get("metadata", []): + if meta.startswith("queue="): + agent_queue = meta.split("=", 1)[1] + break + + if agent_queue == queue: + queue_agents.append({ + "name": agent.get("name"), + "state": agent.get("connection_state"), + "busy": agent.get("job") is not None, + "gpu_index": next( + (m.split("=")[1] for m in agent.get("metadata", []) + if 
m.startswith("gpu-index=")), + None + ), + }) + + return { + "queue": queue, + "total": len(queue_agents), + "idle": sum(1 for a in queue_agents if not a["busy"]), + "agents": queue_agents, + } + + def create_artifact_test_steps(self, queue: str) -> list[dict[str, Any]]: + """Create inline steps for artifact upload/download testing.""" + # Python script that decodes payload and writes result.json + script = ''' +import base64 +import json +import os +import zlib +from datetime import datetime + +run_id = os.environ.get("KERNELBOT_RUN_ID", "unknown") +payload_b64 = os.environ.get("KERNELBOT_PAYLOAD", "") + +print("=== Artifact Test ===") +print(f"Run ID: {run_id}") +print(f"GPU: {os.environ.get('NVIDIA_VISIBLE_DEVICES', 'not set')}") + +# Decode payload if present +config = {} +if payload_b64: + try: + compressed = base64.b64decode(payload_b64) + config_json = zlib.decompress(compressed).decode("utf-8") + config = json.loads(config_json) + print(f"Decoded config keys: {list(config.keys())}") + except Exception as e: + print(f"Could not decode payload: {e}") + +# Create result matching FullResult structure +result = { + "success": True, + "error": "", + "runs": {}, + "system": { + "gpu_name": os.environ.get("NVIDIA_VISIBLE_DEVICES", "unknown"), + "cuda_version": "test", + "python_version": "3.11", + }, +} + +# Write result.json +with open("result.json", "w") as f: + json.dump(result, f, indent=2) + +print("\\n=== Result ===") +print(json.dumps(result, indent=2)) +print("\\nResult written to result.json") +''' + return [ + { + "label": ":test_tube: Artifact Test", + "agents": {"queue": queue}, + "plugins": [ + { + "docker#v5.11.0": { + "image": "python:3.11-slim", + "propagate-environment": True, + "environment": [ + "KERNELBOT_PAYLOAD", + "KERNELBOT_RUN_ID", + "NVIDIA_VISIBLE_DEVICES", + ], + } + } + ], + "command": f"python3 -c {json.dumps(script)}", + "artifact_paths": ["result.json"], + "timeout_in_minutes": 5, + } + ] diff --git a/src/runners/buildkite-runner.py b/src/runners/buildkite-runner.py new file mode 100644 index 00000000..716270db --- /dev/null +++ b/src/runners/buildkite-runner.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +"""Buildkite job runner for kernel evaluation.""" + +import base64 +import json +import os +import sys +import zlib +from dataclasses import asdict +from datetime import datetime +from pathlib import Path + + +def serialize(obj: object): + """Serialize datetime objects for JSON.""" + if isinstance(obj, datetime): + return obj.isoformat() + raise TypeError(f"Type {type(obj)} not serializable") + + +def main(): + run_id = os.environ.get("KERNELBOT_RUN_ID", "unknown") + payload_b64 = os.environ.get("KERNELBOT_PAYLOAD") + + print("=== Kernelbot Evaluation ===") + print(f"Run ID: {run_id}") + print(f"GPU: {os.environ.get('NVIDIA_VISIBLE_DEVICES', 'not set')}") + print(f"GPU Index: {os.environ.get('KERNELBOT_GPU_INDEX', 'not set')}") + print() + + if not payload_b64: + # No payload means this was triggered by push/PR, not API + # Exit gracefully so CI doesn't fail + print("KERNELBOT_PAYLOAD not set - this build was triggered by push/PR, not API.") + print("Skipping evaluation. 
To run an evaluation, trigger via BuildkiteLauncher API.") + print() + print("=== Skipped (no payload) ===") + sys.exit(0) + + # Decode payload + try: + compressed = base64.b64decode(payload_b64) + config_json = zlib.decompress(compressed).decode("utf-8") + config = json.loads(config_json) + except Exception as e: + print(f"ERROR: Failed to decode payload: {e}", file=sys.stderr) + sys.exit(1) + + # Import here to catch import errors clearly + from libkernelbot.run_eval import run_config + + # Run evaluation + print("Starting evaluation...") + result = run_config(config) + + # Write result + result_dict = asdict(result) + result_json = json.dumps(result_dict, default=serialize, indent=2) + Path("result.json").write_text(result_json) + print("Result written to result.json") + + # Print summary + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + + sys.exit(0 if result.success else 1) + + +if __name__ == "__main__": + main() diff --git a/tests/e2e_buildkite_test.py b/tests/e2e_buildkite_test.py new file mode 100644 index 00000000..a1df0b94 --- /dev/null +++ b/tests/e2e_buildkite_test.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""End-to-end test for Buildkite integration. + +Usage: + BUILDKITE_API_TOKEN=xxx python tests/e2e_buildkite_test.py [--queue QUEUE] + +This script: +1. Creates a simple test job +2. Submits it to Buildkite with inline steps (no pipeline config needed) +3. Waits for completion +4. Downloads and prints the result artifact +""" + +import argparse +import asyncio +import os +import sys + +# Add src to path for local testing +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + + +async def main(): + parser = argparse.ArgumentParser(description="E2E test for Buildkite integration") + parser.add_argument("--queue", default="test", help="Buildkite queue name (default: test)") + parser.add_argument("--org", default="gpu-mode", help="Buildkite org slug") + parser.add_argument("--pipeline", default="kernelbot", help="Buildkite pipeline slug") + parser.add_argument("--dry-run", action="store_true", help="Just print config, don't submit") + parser.add_argument( + "--mode", + choices=["artifact", "full"], + default="artifact", + help="Test mode: artifact (simple inline test) or full (uses pipeline from repo)", + ) + args = parser.parse_args() + + token = os.environ.get("BUILDKITE_API_TOKEN") + if not token: + print("ERROR: BUILDKITE_API_TOKEN environment variable not set") + sys.exit(1) + + from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher + + config = BuildkiteConfig( + org_slug=args.org, + pipeline_slug=args.pipeline, + api_token=token, + ) + + print("=== Buildkite E2E Test ===") + print(f"Organization: {config.org_slug}") + print(f"Pipeline: {config.pipeline_slug}") + print(f"Queue: {args.queue}") + print(f"Mode: {args.mode}") + print() + + # Simple test config + test_config = { + "test": True, + "message": "Hello from e2e test", + } + + if args.dry_run: + print("Dry run - config would be:") + import json + + print(json.dumps(test_config, indent=2)) + return + + launcher = BuildkiteLauncher(config) + + # Create a simple status reporter + class SimpleReporter: + async def push(self, msg): + print(f"[STATUS] {msg}") + + async def update(self, msg): + print(f"[UPDATE] {msg}") + + print("Submitting test job...") + + # Use inline steps for artifact mode (no pipeline config needed in Buildkite) + inline_steps = None + if args.mode == "artifact": + 
inline_steps = launcher.create_artifact_test_steps(args.queue) + print("Using inline steps (no pipeline config needed)") + + result = await launcher._launch( + run_id="e2e-test", + config=test_config, + queue=args.queue, + status=SimpleReporter(), + inline_steps=inline_steps, + ) + + print() + print("=== Result ===") + print(f"Success: {result.success}") + if result.error: + print(f"Error: {result.error}") + if result.build_url: + print(f"Build URL: {result.build_url}") + if result.result: + import json + + print("Downloaded artifact:") + print(json.dumps(result.result, indent=2)) + else: + print("No artifact downloaded (result.json not found or download failed)") + + # Also test queue status + print() + print("=== Queue Status ===") + status = await launcher.get_queue_status(args.queue) + print(f"Queue: {status.get('queue')}") + print(f"Total agents: {status.get('total')}") + print(f"Idle agents: {status.get('idle')}") + for agent in status.get("agents", []): + print(f" - {agent['name']}: {agent['state']} (busy={agent['busy']})") + + # Exit with appropriate code + sys.exit(0 if result.success else 1) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/test_buildkite.py b/tests/test_buildkite.py new file mode 100644 index 00000000..cb051f86 --- /dev/null +++ b/tests/test_buildkite.py @@ -0,0 +1,174 @@ +"""Integration tests for Buildkite launcher. + +Usage: + BUILDKITE_API_TOKEN=xxx pytest tests/test_buildkite.py -v -m integration + +These tests require: +1. A Buildkite account with a 'kernelbot' pipeline +2. A self-hosted runner in the 'test' queue +3. The pipeline configured with deployment/buildkite/pipeline-eval.yml +""" + +import os +from pathlib import Path + +import pytest + +from libkernelbot.consts import BuildkiteGPU, SubmissionMode +from libkernelbot.launchers.buildkite import BuildkiteConfig, BuildkiteLauncher +from libkernelbot.report import RunProgressReporter +from libkernelbot.task import build_task_config, make_task_definition + + +class MockProgressReporter(RunProgressReporter): + """Test progress reporter that captures messages.""" + + def __init__(self, title: str = "Test Buildkite Run"): + super().__init__(title) + self.messages = [] + self.updates = [] + + async def push(self, message: str): + self.messages.append(message) + print(f"[STATUS] {message}") + + async def update(self, message: str): + self.updates.append(message) + print(f"[UPDATE] {message}") + + +@pytest.fixture(scope="session") +def buildkite_config(): + """Get Buildkite configuration from environment.""" + token = os.getenv("BUILDKITE_API_TOKEN") + if not token: + pytest.skip("Buildkite integration tests require BUILDKITE_API_TOKEN environment variable") + + org = os.getenv("BUILDKITE_ORG", "mark-saroufim") + pipeline = os.getenv("BUILDKITE_PIPELINE", "kernelbot") + + return BuildkiteConfig( + org_slug=org, + pipeline_slug=pipeline, + api_token=token, + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +@pytest.mark.parametrize("gpu_type", [BuildkiteGPU.L40S_BK]) +async def test_buildkite_launcher_python_script( + project_root: Path, buildkite_config: BuildkiteConfig, gpu_type: BuildkiteGPU +): + """ + Test BuildkiteLauncher with a real Python script. + Uses the vectoradd_py example to verify end-to-end evaluation. 
+ """ + launcher = BuildkiteLauncher(buildkite_config) + reporter = MockProgressReporter("Buildkite Integration Test") + + # Load the vectoradd_py task + task_path = project_root / "examples" / "vectoradd_py" + if not task_path.exists(): + pytest.skip("examples/vectoradd_py not found - skipping Buildkite integration test") + + task_definition = make_task_definition(task_path) + submission_content = (task_path / "submission_triton.py").read_text() + + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=0, # L40S uses Ada Lovelace architecture + mode=SubmissionMode.TEST, + ) + + result = await launcher.run_submission(config, gpu_type, reporter) + + # Basic structure and success + assert result.success, f"Expected successful run, got: {result.error}" + assert result.error == "" + assert isinstance(result.runs, dict) + + # System info + assert "L40S" in result.system.gpu or "NVIDIA" in result.system.gpu + assert "Linux" in result.system.platform + + # Test run structure + assert "test" in result.runs + test_run = result.runs["test"] + + # Run needs to succeed + assert test_run.run.success is True + assert test_run.run.passed is True + assert test_run.run.exit_code == 0 + assert test_run.run.duration > 0 + + # Test results + assert test_run.run.result["check"] == "pass" + test_count = int(test_run.run.result["test-count"]) + assert test_count >= 1 + + # Sanity check for timings + assert test_run.start < test_run.end + + # Check reporter messages + assert any("Buildkite" in msg or "queue" in msg for msg in reporter.messages) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_buildkite_launcher_failing_script( + project_root: Path, buildkite_config: BuildkiteConfig +): + """ + Test BuildkiteLauncher with a script designed to fail. + Ensures we don't pass incorrect submissions. 
+ """ + launcher = BuildkiteLauncher(buildkite_config) + reporter = MockProgressReporter("Buildkite Failing Test") + gpu_type = BuildkiteGPU.L40S_BK + + # Load the identity_py task + task_path = project_root / "examples" / "identity_py" + if not task_path.exists(): + pytest.skip("examples/identity_py not found - skipping Buildkite integration test") + + task_definition = make_task_definition(task_path) + # Use a cheating script that should fail + submission_content = (task_path / "cheat-rng.py").read_text() + + task_definition.task.seed = 653212 + config = build_task_config( + task=task_definition.task, + submission_content=submission_content, + arch=0, + mode=SubmissionMode.LEADERBOARD, + ) + + result = await launcher.run_submission(config, gpu_type, reporter) + + # The workflow should run successfully + assert result.success, f"Expected successful workflow run, got: {result.error}" + assert result.error == "" + + # But the actual test or benchmark should fail + test_passed = result.runs.get("test", {}).run.passed if "test" in result.runs else True + benchmark_passed = result.runs.get("benchmark", {}).run.passed if "benchmark" in result.runs else True + + assert not (test_passed and benchmark_passed), "Expected at least one run to fail for cheating script" + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_buildkite_queue_status(buildkite_config: BuildkiteConfig): + """Test that we can query queue status.""" + launcher = BuildkiteLauncher(buildkite_config) + + status = await launcher.get_queue_status("test") + + assert "queue" in status + assert status["queue"] == "test" + assert "total" in status + assert "idle" in status + assert "agents" in status + assert isinstance(status["agents"], list)
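
Note on the payload contract introduced by this diff: the launcher and the on-node runner communicate solely through the `KERNELBOT_PAYLOAD` build environment variable, so `BuildkiteLauncher._encode_payload` and the decode step in `src/runners/buildkite-runner.py` must remain exact inverses (JSON, zlib-compressed, base64-encoded). A minimal stdlib-only round-trip sketch follows; the sample config keys and values are illustrative, not taken from a real `build_task_config()` payload.

```python
import base64
import json
import zlib


def encode_payload(config: dict) -> str:
    """What the launcher sends: JSON -> zlib -> base64 (ASCII-safe for env vars)."""
    return base64.b64encode(zlib.compress(json.dumps(config).encode("utf-8"))).decode("ascii")


def decode_payload(payload_b64: str) -> dict:
    """What the runner does on the agent: base64 -> zlib -> JSON."""
    return json.loads(zlib.decompress(base64.b64decode(payload_b64)).decode("utf-8"))


if __name__ == "__main__":
    # Illustrative config; real payloads are produced by build_task_config().
    config = {"submission_id": 42, "mode": "test", "message": "Hello from manual test"}
    payload = encode_payload(config)
    assert decode_payload(payload) == config
    print(f"{len(json.dumps(config))} JSON bytes -> {len(payload)} base64 chars")
```

If the encoding ever changes (for example, a different compression scheme or level), both sides need to change in the same commit, since the payload crosses the Buildkite API as an opaque string.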