diff --git a/.buildkite/README.md b/.buildkite/README.md new file mode 100644 index 000000000..c38870a51 --- /dev/null +++ b/.buildkite/README.md @@ -0,0 +1,123 @@ +# Multi-Queue Buildkite Configuration + +This directory contains the configuration for running SciMLBenchmarks on multiple Buildkite queues, enabling GPU-intensive benchmarks to run on GPU-enabled compute resources while maintaining CPU-only benchmarks on the standard queue. + +## Files + +- `queue_config.yml`: Central configuration mapping benchmarks to queues +- `generate_pipeline.jl`: Dynamic pipeline generator that creates appropriate Buildkite steps +- `test_sciml.yml`: Main Buildkite pipeline configuration +- `path_processors/project-coalescing`: Script to determine which benchmarks need rebuilding + +## How It Works + +1. **Queue Detection**: When the pipeline is generated, the system determines which queue each benchmark should use based on: + - Benchmark-specific `[buildkite]` section in `Project.toml` (highest priority) + - Central mapping in `queue_config.yml` + - Default queue (`juliaecosystem`) as fallback + +2. **Dynamic Pipeline Generation**: The `generate_pipeline.jl` script: + - Reads the queue configuration + - Processes changed files to determine which benchmarks to run + - Generates appropriate Buildkite steps with correct queue assignments + - Prints the dynamic pipeline as YAML to stdout, which the `test_sciml.yml` step pipes to `buildkite-agent pipeline upload` + +3. 
**Queue-Specific Configuration**: Each queue can have different: + - Architecture requirements (`arch`) + - Operating system (`os`) + - Environment variables (e.g., GPU-specific settings) + - Timeout values + +## Available Queues + +### `juliaecosystem` (Default) +- **Purpose**: CPU-only compute queue +- **Architecture**: x86_64 Linux +- **Usage**: Standard Julia benchmarks, ODE solvers, linear algebra + +### `gpu` +- **Purpose**: GPU-enabled compute queue +- **Architecture**: x86_64 Linux with GPU support +- **Usage**: Neural PDE benchmarks, GPU-accelerated simulations +- **Environment**: Includes CUDA-related environment variables + +## Configuring Benchmarks for Specific Queues + +### Method 1: Project.toml (Recommended) + +Add a `[buildkite]` section to your benchmark's `Project.toml`: + +```toml +[deps] +# ... your dependencies + +[buildkite] +# Specify the queue this benchmark should use +queue = "gpu" +``` + +### Method 2: Central Configuration + +Edit `.buildkite/queue_config.yml` and add your benchmark folder: + +```yaml +benchmark_queues: + YourBenchmarkFolder: "gpu" +``` + +## Adding New Queues + +1. **Define the queue** in `queue_config.yml`: +```yaml +queues: + your_new_queue: + arch: "x86_64" + os: "linux" + description: "Description of queue capabilities" +``` + +2. **Configure benchmarks** to use the new queue using either method above + +3. 
**Set up the Buildkite infrastructure** to have agents with the `your_new_queue` tag + +## Testing + +To test the pipeline generation locally: + +```bash +# Test with specific files +julia .buildkite/generate_pipeline.jl benchmarks/PINNOptimizers/poisson.jmd + +# Test with benchmark folders +julia .buildkite/generate_pipeline.jl benchmarks/PINNOptimizers +``` + +## Backward Compatibility + +- Benchmarks without queue specifications continue to use `juliaecosystem` +- Existing CI behavior is preserved for benchmarks that don't opt into GPU queues +- A missing `queue_config.yml` aborts pipeline generation with an explicit error; a queue name that is not defined in the `queues` section falls back to the default `x86_64`/`linux` agent settings + +## Environment Variables + +For GPU benchmarks, the following environment variables are automatically set: + +- `JULIA_CUDA_USE_BINARYBUILDER=false`: Use system CUDA instead of artifacts +- `JULIA_GPU_ALLOW_DEFAULT=true`: Allow default GPU selection + +## Troubleshooting + +### Pipeline Not Generated +- Check that `YAML.jl` is available in the Julia environment +- Verify the `queue_config.yml` syntax is valid YAML +- Ensure changed files are properly detected + +### Wrong Queue Assignment +- Check the `[buildkite]` section in your benchmark's `Project.toml` +- Verify the benchmark name matches the folder name in `queue_config.yml` +- Confirm the queue exists in the `queues` section + +### GPU Jobs Failing +- Verify GPU agents are available and tagged with `queue: "gpu"` +- Check that CUDA dependencies are properly configured on GPU nodes +- Review environment variables for GPU-specific settings \ No newline at end of file diff --git a/.buildkite/generate_pipeline.jl b/.buildkite/generate_pipeline.jl new file mode 100644 index 000000000..4cf297bba --- /dev/null +++ b/.buildkite/generate_pipeline.jl @@ -0,0 +1,132 @@ +#!/usr/bin/env julia + +using YAML, Pkg + +""" +Generate dynamic Buildkite pipeline based on queue configuration and changed files. 
+"""
+
+# ---------------------------------------------------------------------------
+# Queue configuration
+# ---------------------------------------------------------------------------
+# `queue_config.yml` lives next to this script. It is required: without it we
+# cannot know which queues exist, so generation stops with an explicit error.
+config_path = joinpath(@__DIR__, "queue_config.yml")
+if !isfile(config_path)
+    error("Queue configuration file not found: $config_path")
+end
+
+config = YAML.load_file(config_path)
+default_queue = get(config, "default_queue", "juliaecosystem")
+# A section that is present but empty in the YAML loads as `nothing`;
+# treat it like a missing section so the later `get` calls keep working.
+queues = something(get(config, "queues", nothing), Dict())
+benchmark_queues = something(get(config, "benchmark_queues", nothing), Dict())
+
+# ---------------------------------------------------------------------------
+# Changed files
+# ---------------------------------------------------------------------------
+# Source priority: BUILDKITE_CHANGED_FILES (newline separated), then the
+# command-line arguments, then a demonstration default.
+changed_files = if haskey(ENV, "BUILDKITE_CHANGED_FILES")
+    # Drop blank entries (trailing newline, empty variable) so they never turn
+    # into a step with an empty benchmark name.
+    filter(!isempty, strip.(split(ENV["BUILDKITE_CHANGED_FILES"], "\n")))
+elseif length(ARGS) > 0
+    ARGS
+else
+    # Default to testing folder for demonstration
+    ["benchmarks/Testing/test.jmd"]
+end
+
+# ---------------------------------------------------------------------------
+# Build targets
+# ---------------------------------------------------------------------------
+# NOTE: `targets` is bound from the *value* of the `if`/`try` expression.
+# Assigning it inside the `try` body would create a variable local to the
+# `try` block (top-level `try` is a soft local scope in a script), leaving
+# `targets` undefined for the loop below.
+coalescing_script = joinpath(@__DIR__, "path_processors", "project-coalescing")
+targets = if isfile(coalescing_script) && haskey(ENV, "BUILDKITE_AGENT_ACCESS_TOKEN")
+    # Only use project-coalescing if we're in a real Buildkite environment
+    try
+        split(strip(read(`$coalescing_script $changed_files`, String)))
+    catch e
+        @warn "project-coalescing failed, using fallback" exception=e
+        changed_files
+    end
+else
+    # Fallback: use the provided targets directly or extract from file paths
+    if any(startswith(f, "benchmarks/") for f in changed_files)
+        unique([startswith(f, "benchmarks/") ? f : "benchmarks/$f" for f in changed_files])
+    else
+        changed_files
+    end
+end
+
+# ---------------------------------------------------------------------------
+# Pipeline steps: one step per target
+# ---------------------------------------------------------------------------
+steps = []
+
+for target in targets
+    # A target is either a single benchmark file or a benchmark folder; in both
+    # cases the benchmark is named after the folder.
+    benchmark_dir = isfile(target) ? dirname(target) : target
+    # Strip a trailing slash first, otherwise `basename("a/b/")` is "".
+    benchmark_name = String(basename(rstrip(benchmark_dir, '/')))
+
+    # Determine queue for this benchmark
+    # Priority: 1. Project.toml metadata, 2. queue_config.yml, 3. default
+    project_path = joinpath(benchmark_dir, "Project.toml")
+
+    # `project_queue` stays `nothing` unless Project.toml names a queue. Keeping
+    # it separate from `queue_name` lets an explicit Project.toml choice of the
+    # default queue still win over the central mapping.
+    project_queue = if isfile(project_path)
+        try
+            project_content = read(project_path, String)
+            # Simple regex to extract queue from [buildkite] section
+            queue_match = match(r"\[buildkite\].*?queue\s*=\s*\"([^\"]+)\""s, project_content)
+            queue_match === nothing ? nothing : String(queue_match.captures[1])
+        catch e
+            @warn "Could not parse Project.toml for queue info: $(project_path)" exception=e
+            nothing
+        end
+    else
+        nothing
+    end
+
+    queue_name = if project_queue !== nothing
+        project_queue
+    else
+        # Fallback to configuration file, then to the default queue
+        get(benchmark_queues, benchmark_name, default_queue)
+    end
+
+    # An unknown queue keeps its name but gets default agent settings. Warn on
+    # stderr (stdout carries the YAML) so a typo does not go unnoticed.
+    if !haskey(queues, queue_name)
+        @warn "Queue is not defined in queue_config.yml; using default agent settings" queue=queue_name benchmark=benchmark_name
+    end
+    queue_info = get(queues, queue_name, Dict("arch" => "x86_64", "os" => "linux"))
+
+    # Create step configuration
+    step = Dict(
+        "label" => ":julia: $(benchmark_name) on $(queue_name)",
+        "command" => "julia benchmark.jl $(target)",
+        "plugins" => [
+            Dict("JuliaCI/julia#v1" => Dict("version" => "1.10")),
+            Dict("JuliaCI/julia-test#v1" => nothing)
+        ],
+        "timeout_in_minutes" => 60, # Increased timeout for potential GPU jobs
+        "artifact_paths" => [
+            "html/$(benchmark_name)/*.html",
+            "markdown/$(benchmark_name)/*.md",
+            "notebook/$(benchmark_name)/*.ipynb",
+            "pdf/$(benchmark_name)/*.pdf",
+            "script/$(benchmark_name)/*.jl"
+        ],
+        "agents" => Dict(
+            "queue" => queue_name,
+            "arch" => get(queue_info, "arch", "x86_64"),
+            "os" => get(queue_info, "os", "linux")
+        )
+    )
+
+    # Add GPU-specific environment variables if needed
+    if queue_name == "gpu"
+        step["env"] = Dict(
+            "JULIA_CUDA_USE_BINARYBUILDER" => "false",
+            "JULIA_GPU_ALLOW_DEFAULT" => "true"
+        )
+    end
+
+    push!(steps, step)
+end
+
+# Generate final pipeline
+pipeline = Dict("steps" => steps)
+
+# Output as YAML on stdout; the calling Buildkite step pipes it to
+# `buildkite-agent pipeline upload`.
+println(YAML.write(pipeline))
\ No newline at end of file
diff --git a/.buildkite/queue_config.yml
b/.buildkite/queue_config.yml new file mode 100644 index 000000000..0fa53ba95 --- /dev/null +++ b/.buildkite/queue_config.yml @@ -0,0 +1,34 @@ +# Queue Configuration for SciMLBenchmarks +# This file maps benchmark folders to their preferred Buildkite queues +# If a benchmark is not listed here, it will use the default queue + +# Default queue for all benchmarks (backward compatibility) +default_queue: "juliaecosystem" + +# Available queues and their characteristics +queues: + juliaecosystem: + arch: "x86_64" + os: "linux" + description: "CPU-only compute queue" + + gpu: + arch: "x86_64" + os: "linux" + description: "GPU-enabled compute queue" + # Add other GPU-specific requirements as needed + +# Benchmark-specific queue assignments +benchmark_queues: + # GPU-intensive benchmarks + PINNOptimizers: "gpu" + PINNErrorsVsTime: "gpu" + NBodySimulator: "gpu" # Can benefit from GPU acceleration + + # CPU-only benchmarks (explicitly set for clarity) + NonStiffODE: "juliaecosystem" + StiffODE: "juliaecosystem" + LinearSolve: "juliaecosystem" + + # Add more mappings as needed + # BenchmarkFolder: "queue_name" \ No newline at end of file diff --git a/.buildkite/test_sciml.yml b/.buildkite/test_sciml.yml index 99f9d6734..c16609fa7 100644 --- a/.buildkite/test_sciml.yml +++ b/.buildkite/test_sciml.yml @@ -1,21 +1,11 @@ steps: - - label: ":julia: Run tests on LTS" - plugins: - - JuliaCI/julia#v1: - version: '1.10' - - JuliaCI/julia-test#v1: - timeout_in_minutes: 20 - artifact_paths: - # Upload .html - - "html/Testing/*.html" - # Upload markdown - - "markdown/Testing/*.md" - # Upload notebook - - "notebook/Testing/*.ipynb" - # Upload .pdf files - - "pdf/Testing/*.pdf" - # Upload Julia script - - "script/Testing/*.jl" + - label: ":gear: Generate Dynamic Pipeline" + command: | + # Install required Julia packages + julia -e 'using Pkg; Pkg.add("YAML")' + + # Generate dynamic pipeline based on changed files and queue configuration + julia .buildkite/generate_pipeline.jl | 
buildkite-agent pipeline upload agents: queue: "juliaecosystem" arch: "x86_64" diff --git a/benchmarks/PINNOptimizers/Project.toml b/benchmarks/PINNOptimizers/Project.toml index 7cd4bce2a..86e307866 100644 --- a/benchmarks/PINNOptimizers/Project.toml +++ b/benchmarks/PINNOptimizers/Project.toml @@ -17,3 +17,7 @@ OptimizationOptimJL = "0.4" OptimizationOptimisers = "0.3" Plots = "1" SciMLBenchmarks = "0.1" + +[buildkite] +# Specify that this benchmark should run on the GPU queue +queue = "gpu"