diff --git a/.github/workflows/build-inference.yml b/.github/workflows/build-inference.yml new file mode 100644 index 0000000..61f8869 --- /dev/null +++ b/.github/workflows/build-inference.yml @@ -0,0 +1,201 @@ +# Build llama-server for every supported GGML backend. +# +# Backend matrix: +# cuda — NVIDIA CUDA, built inside Docker on ubuntu-latest, pushed to GHCR +# rocm — AMD ROCm/HIP, built inside Docker on ubuntu-latest, pushed to GHCR +# cpu — CPU-only, built inside Docker on ubuntu-latest, pushed to GHCR +# metal — Apple Metal, built natively on macos-latest (Metal GPU frameworks +# are unavailable inside Linux containers); binaries uploaded as +# workflow artifacts and attached to GitHub Releases. +# +# Images are tagged: +# ghcr.io/<owner>/atlas/llama-server:<branch>-<backend> +# ghcr.io/<owner>/atlas/llama-server:sha-<sha>-<backend> +# ghcr.io/<owner>/atlas/llama-server:<version>-<backend> (on release) +# +# Trigger conditions: +# • push to main that touches inference/ or this file +# • any pull request that touches inference/ or this file (build only, no push) +# • GitHub Release published (build + push + attach Metal zip to release) +# • workflow_dispatch for ad-hoc builds + +name: Build Inference Images + +on: + push: + branches: [main] + paths: + - "inference/**" + - ".github/workflows/build-inference.yml" + pull_request: + paths: + - "inference/**" + - ".github/workflows/build-inference.yml" + release: + types: [published] + workflow_dispatch: + inputs: + push_images: + description: "Push images to GHCR (linux backends)" + type: boolean + default: false + cuda_architectures: + description: "CUDA architectures (semicolon-separated, e.g. 
89-real;90-real;120-real)" + type: string + default: "89-real;90-real;120-real" + +env: + REGISTRY: ghcr.io + # Image namespace: ghcr.io/<owner>/atlas/llama-server + IMAGE_NAME: ${{ github.repository_owner }}/atlas/llama-server + +jobs: + # ───────────────────────────────────────────────────────────────────────── + # Linux builds: CUDA / ROCm / CPU + # The Dockerfiles contain all compiler toolchains (nvcc, hipcc) so no GPU + # hardware is required on the runner itself — compilation happens inside + # the container image layers. + # ───────────────────────────────────────────────────────────────────────── + build-linux: + name: "${{ matrix.backend }} (ubuntu-latest)" + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + strategy: + fail-fast: false + matrix: + include: + - backend: cuda + # CUDA arch targets: + # 89-real = Ada Lovelace (RTX 4000, L40) + # 90-real = Hopper (H100, H200) + # 120-real = Blackwell (GB200, RTX 5000 series) + # Override via workflow_dispatch input to target a single GPU. 
+ cuda_architectures: "89-real;90-real;120-real" + - backend: rocm + cuda_architectures: "" + - backend: cpu + cuda_architectures: "" + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + # Skip on PRs to avoid credential exposure for untrusted forks + if: > + github.event_name != 'pull_request' && + (github.event_name != 'workflow_dispatch' || inputs.push_images) + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Resolve CUDA architectures + id: cuda_arch + run: | + # workflow_dispatch input overrides matrix default + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && \ + [ -n "${{ inputs.cuda_architectures }}" ]; then + echo "value=${{ inputs.cuda_architectures }}" >> "$GITHUB_OUTPUT" + else + echo "value=${{ matrix.cuda_architectures }}" >> "$GITHUB_OUTPUT" + fi + + - name: Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch,suffix=-${{ matrix.backend }} + type=ref,event=pr,suffix=-${{ matrix.backend }} + type=semver,pattern={{version}},suffix=-${{ matrix.backend }} + type=semver,pattern={{major}}.{{minor}},suffix=-${{ matrix.backend }} + type=sha,prefix=sha-,suffix=-${{ matrix.backend }} + + - name: Build (and push) Docker image + uses: docker/build-push-action@v6 + with: + context: ./inference + file: ./inference/Dockerfile.v31 + build-args: | + GGML_BACKEND=${{ matrix.backend }} + CUDA_ARCHITECTURES=${{ steps.cuda_arch.outputs.value }} + push: >- + ${{ + github.event_name == 'push' || + github.event_name == 'release' || + (github.event_name == 'workflow_dispatch' && inputs.push_images) + }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + # Layer cache scoped per backend so cuda/rocm/cpu don't share + cache-from: type=gha,scope=inference-${{ 
matrix.backend }} + cache-to: type=gha,scope=inference-${{ matrix.backend }},mode=max + + # ───────────────────────────────────────────────────────────────────────── + # Metal build: native macOS + # + # Metal GPU frameworks (Metal.framework, MetalPerformanceShaders, etc.) are + # macOS-only and cannot be accessed from inside a Linux Docker container. + # The binary produced here runs directly on the host — no container needed. + # + # Outputs: + # • workflow artifact: llama-server-metal-macos-arm64 + # • on release: zip attached to the GitHub Release + # ───────────────────────────────────────────────────────────────────────── + build-metal: + name: "metal (macos-latest)" + runs-on: macos-latest + permissions: + contents: write # needed to upload release assets + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install build dependencies + run: brew install cmake + + - name: Clone llama.cpp + run: | + git clone --depth 1 https://github.com/ggml-org/llama.cpp /tmp/llama.cpp + + - name: Build with GGML_METAL=ON + run: | + cd /tmp/llama.cpp + cmake -B build \ + -DGGML_METAL=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_BUILD_TYPE=Release + cmake --build build --config Release -j$(sysctl -n hw.logicalcpu) + + - name: Smoke-test binary + run: /tmp/llama.cpp/build/bin/llama-server --version + + - name: Upload binaries as workflow artifact + uses: actions/upload-artifact@v4 + with: + name: llama-server-metal-macos-arm64 + path: | + /tmp/llama.cpp/build/bin/llama-server + /tmp/llama.cpp/build/bin/llama-cli + if-no-files-found: error + retention-days: 90 + + - name: Attach binaries to GitHub Release + if: github.event_name == 'release' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + cd /tmp/llama.cpp/build/bin + zip -j llama-server-metal-macos-arm64.zip llama-server llama-cli + gh release upload "${{ github.ref_name }}" \ + llama-server-metal-macos-arm64.zip \ + --repo "${{ github.repository }}" diff --git a/benchmark/analysis/hardware_info.py 
b/benchmark/analysis/hardware_info.py index 2b404dc..9797ed7 100644 --- a/benchmark/analysis/hardware_info.py +++ b/benchmark/analysis/hardware_info.py @@ -39,45 +39,28 @@ def run_command(cmd: str, default: str = "") -> str: return default -def get_gpu_info() -> Dict[str, Any]: +def _get_nvidia_gpu_info(info: Dict[str, Any]) -> bool: """ - Get GPU information using nvidia-smi. - - Returns: - Dictionary with GPU model, VRAM, driver version, and power draw + Populate *info* from nvidia-smi. Returns True if NVIDIA GPU was found. """ - info = { - "model": "", - "vram_gb": 0.0, - "driver_version": "", - "power_draw_watts": 0.0 - } - - # Try nvidia-smi - nvidia_smi = run_command("which nvidia-smi") - if not nvidia_smi: - return info + if not run_command("which nvidia-smi"): + return False - # Get GPU name name = run_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits") if name: info["model"] = name.split('\n')[0].strip() - # Get VRAM vram = run_command("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits") if vram: try: - # Convert MiB to GB info["vram_gb"] = float(vram.split('\n')[0].strip()) / 1024 except ValueError: pass - # Get driver version driver = run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits") if driver: info["driver_version"] = driver.split('\n')[0].strip() - # Get current power draw power = run_command("nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits") if power: try: @@ -85,27 +68,111 @@ def get_gpu_info() -> Dict[str, Any]: except ValueError: pass + return bool(info["model"]) + + +def _get_rocm_gpu_info(info: Dict[str, Any]) -> bool: + """ + Populate *info* from rocm-smi (AMD ROCm). Returns True if AMD GPU was found. 
+ """ + if not run_command("which rocm-smi"): + return False + + # rocm-smi --showproductname prints lines like "GPU[0] : Product Name: Radeon RX 7900 XTX" + name = run_command("rocm-smi --showproductname --noheader 2>/dev/null | awk -F': ' '/Product Name/{print $NF; exit}'") + if name: + info["model"] = name.strip() + + # VRAM in bytes → GB + vram = run_command("rocm-smi --showmeminfo vram --noheader 2>/dev/null | awk '/Total Memory/{print $NF; exit}'") + if vram: + try: + info["vram_gb"] = float(vram.strip()) / (1024 ** 3) + except ValueError: + pass + + driver = run_command("rocm-smi --showdriverversion --noheader 2>/dev/null | awk '{print $NF; exit}'") + if driver: + info["driver_version"] = driver.strip() + + return bool(info["model"]) + + +def _get_metal_gpu_info(info: Dict[str, Any]) -> bool: + """ + Populate *info* from system_profiler (Apple Metal / macOS). + Returns True if a Metal GPU was found. + """ + if platform.system() != "Darwin": + return False + + sp_out = run_command("system_profiler SPDisplaysDataType 2>/dev/null") + if not sp_out: + return False + + match = re.search(r'Chipset Model:\s*(.+)', sp_out) + if match: + info["model"] = match.group(1).strip() + + match = re.search(r'VRAM \([^)]+\):\s*([\d.]+)\s*(MB|GB)', sp_out, re.IGNORECASE) + if match: + try: + vram_val = float(match.group(1)) + if match.group(2).upper() == "MB": + vram_val /= 1024 + info["vram_gb"] = vram_val + except ValueError: + pass + + return bool(info["model"]) + + +def get_gpu_info() -> Dict[str, Any]: + """ + Get GPU information, trying NVIDIA, AMD ROCm, and Apple Metal in order. + + Returns: + Dictionary with GPU model, VRAM, driver version, and power draw + """ + info = { + "model": "", + "vram_gb": 0.0, + "driver_version": "", + "power_draw_watts": 0.0 + } + + _get_nvidia_gpu_info(info) or _get_rocm_gpu_info(info) or _get_metal_gpu_info(info) return info def get_cuda_version() -> str: """ - Get CUDA version. + Get the GPU accelerator version (CUDA, ROCm, or Metal). 
Returns: - CUDA version string + Version string for the active GPU accelerator, or empty string. """ - # Try nvcc first + # CUDA — try nvcc then nvidia-smi nvcc_version = run_command("nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'") if nvcc_version: return nvcc_version - # Try nvidia-smi nvidia_smi_output = run_command("nvidia-smi") match = re.search(r'CUDA Version:\s*(\d+\.\d+)', nvidia_smi_output) if match: return match.group(1) + # ROCm — hipconfig + rocm_version = run_command("hipconfig --version 2>/dev/null | head -1") + if rocm_version: + return rocm_version + + # Metal — macOS build version (Metal is always present on modern macOS) + if platform.system() == "Darwin": + macos_ver = run_command("sw_vers -productVersion 2>/dev/null") + if macos_ver: + return f"Metal (macOS {macos_ver.strip()})" + return "" diff --git a/docker-compose.rocm.yml b/docker-compose.rocm.yml new file mode 100644 index 0000000..274fdfa --- /dev/null +++ b/docker-compose.rocm.yml @@ -0,0 +1,24 @@ +# AMD ROCm override for docker-compose.yml +# +# Usage: +# GGML_BACKEND=rocm docker compose -f docker-compose.yml -f docker-compose.rocm.yml up --build +# +# This override: +# - Sets GGML_BACKEND=rocm for the llama-server build +# - Replaces the NVIDIA deploy block with ROCm device mappings +# - Mounts /dev/kfd (ROCm kernel driver) and /dev/dri (GPU render nodes) + +services: + llama-server: + build: + args: + GGML_BACKEND: rocm + deploy: !reset {} # remove the nvidia deploy block + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + group_add: + - video + - render + environment: + - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-} diff --git a/docker-compose.yml b/docker-compose.yml index 07ea2c2..b0a8b00 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,28 @@ services: - # --- LLM Inference (CUDA + grammar support) --- + # --- LLM Inference --- + # Build backend is selected with GGML_BACKEND (cuda|rocm|cpu). 
+ # Pre-built Metal binaries for macOS are produced by the GitHub Actions + # workflow (.github/workflows/build-inference.yml) on macos-latest and + # are not used with docker compose — run llama-server directly on macOS. + # + # GPU access uses the Docker Compose `deploy.resources` spec: + # - NVIDIA (default): works with the nvidia container runtime installed. + # - AMD/ROCm: use docker-compose.rocm.yml override (maps /dev/kfd, /dev/dri). + # - CPU-only: remove the `deploy` block (or override with GGML_BACKEND=cpu). llama-server: build: context: ./inference dockerfile: Dockerfile.v31 - # GPU access: works with Docker (nvidia runtime) and Podman (--device) - devices: - - /dev/nvidia0:/dev/nvidia0 - - /dev/nvidiactl:/dev/nvidiactl - - /dev/nvidia-uvm:/dev/nvidia-uvm - environment: - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=compute,utility + args: + GGML_BACKEND: ${GGML_BACKEND:-cuda} + CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES:-89-real;90-real;120-real} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] volumes: - ${ATLAS_MODELS_DIR:-./models}:/models:ro command: > diff --git a/inference/Dockerfile b/inference/Dockerfile index 44e04e7..79f1d59 100644 --- a/inference/Dockerfile +++ b/inference/Dockerfile @@ -1,19 +1,54 @@ -FROM docker.io/nvidia/cuda:12.8.0-devel-rockylinux9 AS builder -RUN dnf install -y git cmake gcc-c++ && dnf clean all -RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp && \ - cd /llama.cpp && \ - # ATLAS patch: prevent --embeddings from poisoning the draft model context. - # Without this, embedding=true propagates to the draft via params_dft = params_base, - # causing output_all=true and n_ubatch mismatch → 0% spec decode acceptance. - # See: patches/fix-embeddings-spec-decode.patch for details. 
- sed -i '/auto params_dft = params_base;/a\ params_dft.embedding = false; // ATLAS: draft never needs embeddings' \ - tools/server/server-context.cpp && \ - cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CUDA_ARCHITECTURES=120 && \ - cmake --build build --config Release -j$(nproc) +# GGML_BACKEND selects the GPU/compute back-end for Linux container builds: +# cuda (default) — NVIDIA CUDA via nvidia/cuda base image +# rocm — AMD ROCm/HIP via rocm/dev-ubuntu base image +# cpu — CPU-only fallback, no GPU required +# +# Apple Metal requires native macOS toolchain — use the GitHub Actions workflow +# (.github/workflows/build-inference.yml) which runs on macos-latest. +# +# CUDA_ARCHITECTURES targets: +# 89-real — Ada Lovelace (RTX 4000 series, L40) +# 90-real — Hopper (H100, H200) +# 120-real — Blackwell (GB200, RTX 5000 series) +# Narrow to your GPU to reduce compile time, e.g.: +# docker build --build-arg CUDA_ARCHITECTURES=89-real . +ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real -FROM docker.io/nvidia/cuda:12.8.0-runtime-rockylinux9 +# === Build-stage base images (only the selected one is resolved) === +FROM docker.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 AS base-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS base-rocm +FROM ubuntu:24.04 AS base-cpu + +# === Builder === +FROM base-${GGML_BACKEND} AS builder +ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real + +ENV DEBIAN_FRONTEND=noninteractive TMPDIR=/llama.cpp/tmp +RUN apt-get update && apt-get install -y --no-install-recommends \ + git cmake gcc g++ make \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp \ + && mkdir -p /llama.cpp/tmp \ + && cd /llama.cpp \ + && case "$GGML_BACKEND" in \ + cuda) CMAKE_EXTRA="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}" ;; \ + rocm) CMAKE_EXTRA="-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx906;gfx908;gfx90a;gfx1030;gfx1100" ;; \ + *) CMAKE_EXTRA="" ;; 
\ + esac \ + && cmake -B build -DBUILD_SHARED_LIBS=OFF ${CMAKE_EXTRA} \ + && cmake --build build --config Release -j$(nproc) + +# === Runtime base images === +FROM docker.io/nvidia/cuda:12.8.0-runtime-ubuntu22.04 AS runtime-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS runtime-rocm +FROM ubuntu:24.04 AS runtime-cpu + +FROM runtime-${GGML_BACKEND} AS final COPY --from=builder /llama.cpp/build/bin/llama-server /usr/local/bin/ -COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ +COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ RUN mkdir -p /models /templates COPY templates/ /templates/ COPY entrypoint.sh /entrypoint.sh diff --git a/inference/Dockerfile.v31 b/inference/Dockerfile.v31 index 4181ff1..ca67263 100644 --- a/inference/Dockerfile.v31 +++ b/inference/Dockerfile.v31 @@ -1,21 +1,57 @@ -FROM docker.io/nvidia/cuda:12.8.0-devel-rockylinux9 AS builder -RUN dnf install -y git cmake gcc-c++ && dnf clean all -# Use /llama.cpp/tmp for build temp to avoid /tmp space exhaustion -ENV TMPDIR=/llama.cpp/tmp -RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp && \ - cd /llama.cpp && \ - mkdir -p /llama.cpp/tmp && \ - # ATLAS patch: prevent --embeddings from poisoning the draft model context. - # Kept for compatibility — harmless when no draft model is used (Qwen3.5). - # If the patch target line doesn't exist in newer versions, the sed is a no-op. 
- sed -i '/auto params_dft = params_base;/a\ params_dft.embedding = false; // ATLAS: draft never needs embeddings' \ - tools/server/server-context.cpp 2>/dev/null || true && \ - cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CUDA_ARCHITECTURES=120 && \ - cmake --build build --config Release -j5 +# GGML_BACKEND selects the GPU/compute back-end for Linux container builds: +# cuda (default) — NVIDIA CUDA via nvidia/cuda base image +# rocm — AMD ROCm/HIP via rocm/dev-ubuntu base image +# cpu — CPU-only fallback, no GPU required +# +# Apple Metal is NOT supported here. Metal requires native macOS toolchain +# (Xcode + Metal framework) that is unavailable inside any Linux container. +# Use the GitHub Actions workflow (.github/workflows/build-inference.yml), +# which runs on macos-latest, to produce Metal binaries. +# +# CUDA_ARCHITECTURES targets: +# 89-real — Ada Lovelace (RTX 4000 series, L40) +# 90-real — Hopper (H100, H200) +# 120-real — Blackwell (GB200, RTX 5000 series) +# Narrow to your GPU to reduce compile time, e.g.: +# docker build --build-arg GGML_BACKEND=rocm . +# docker build --build-arg CUDA_ARCHITECTURES=89-real . 
+ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real -FROM docker.io/nvidia/cuda:12.8.0-runtime-rockylinux9 +# === Build-stage base images (only the selected one is resolved) === +FROM docker.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 AS base-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS base-rocm +FROM ubuntu:24.04 AS base-cpu + +# === Builder === +FROM base-${GGML_BACKEND} AS builder +ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real + +ENV DEBIAN_FRONTEND=noninteractive TMPDIR=/llama.cpp/tmp +RUN apt-get update && apt-get install -y --no-install-recommends \ + git cmake gcc g++ make \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp \ + && mkdir -p /llama.cpp/tmp \ + && cd /llama.cpp \ + && case "$GGML_BACKEND" in \ + cuda) CMAKE_EXTRA="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}" ;; \ + rocm) CMAKE_EXTRA="-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx906;gfx908;gfx90a;gfx1030;gfx1100" ;; \ + *) CMAKE_EXTRA="" ;; \ + esac \ + && cmake -B build -DBUILD_SHARED_LIBS=OFF ${CMAKE_EXTRA} \ + && cmake --build build --config Release -j$(nproc) + +# === Runtime base images === +FROM docker.io/nvidia/cuda:12.8.0-runtime-ubuntu22.04 AS runtime-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS runtime-rocm +FROM ubuntu:24.04 AS runtime-cpu + +FROM runtime-${GGML_BACKEND} AS final COPY --from=builder /llama.cpp/build/bin/llama-server /usr/local/bin/ -COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ +COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ RUN mkdir -p /models /templates COPY templates/ /templates/ COPY entrypoint-v3.1-9b.sh /entrypoint.sh diff --git a/inference/entrypoint-mtp.sh b/inference/entrypoint-mtp.sh index ddba8b0..664b611 100755 --- a/inference/entrypoint-mtp.sh +++ b/inference/entrypoint-mtp.sh @@ -14,9 +14,12 @@ KV_FLAGS="-ctk $KV_CACHE_K -ctv $KV_CACHE_V" PARALLEL="${PARALLEL_SLOTS:-4}" 
MODEL_FILE="${MODEL_PATH:-/models/Qwen3.5-9B-MTP-Q4_K_M-F16mtp.gguf}" -export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" -export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" -export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +# CUDA performance tuning — ignored on non-CUDA backends +if command -v nvidia-smi >/dev/null 2>&1; then + export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" + export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" + export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +fi echo "=== V3.1 MTP: Qwen3.5-9B — Generation + MTP + Self-Embeddings ===" echo " Model: $MODEL_FILE" diff --git a/inference/entrypoint-v3-specdec.sh b/inference/entrypoint-v3-specdec.sh index 2d708dd..7bd4700 100644 --- a/inference/entrypoint-v3-specdec.sh +++ b/inference/entrypoint-v3-specdec.sh @@ -25,9 +25,12 @@ TEMPLATE="${CHAT_TEMPLATE:-Qwen3-custom.jinja}" PARALLEL="${PARALLEL_SLOTS:-2}" DRAFT_MODEL="${DRAFT_MODEL:-/models/Qwen3-0.6B-Q8_0.gguf}" -export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" -export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" -export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +# CUDA performance tuning — ignored on non-CUDA backends +if command -v nvidia-smi >/dev/null 2>&1; then + export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" + export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" + export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +fi echo "=== V3: Generation + Self-Embeddings + Speculative Decoding ===" echo " Context: $CTX_LENGTH (draft: $DRAFT_CTX) | KV: $KV_CACHE_TYPE | Parallel: $PARALLEL" diff --git a/inference/entrypoint-v3.1-9b.sh b/inference/entrypoint-v3.1-9b.sh index 02da5ef..ca591ea 100755 --- a/inference/entrypoint-v3.1-9b.sh +++ b/inference/entrypoint-v3.1-9b.sh @@ -29,9 +29,12 @@ KV_FLAGS="-ctk $KV_CACHE_K -ctv $KV_CACHE_V" PARALLEL="${PARALLEL_SLOTS:-4}" MODEL_FILE="${MODEL_PATH:-/models/Qwen3.5-9B-Q6_K.gguf}" 
-export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" -export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" -export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +# CUDA performance tuning — ignored on non-CUDA backends +if command -v nvidia-smi >/dev/null 2>&1; then + export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" + export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" + export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +fi echo "=== V3.1: Qwen3.5-9B — Generation + Self-Embeddings ===" echo " Model: $MODEL_FILE"