diff --git a/.github/workflows/build-inference.yml b/.github/workflows/build-inference.yml new file mode 100644 index 0000000..61f8869 --- /dev/null +++ b/.github/workflows/build-inference.yml @@ -0,0 +1,201 @@ +# Build llama-server for every supported GGML backend. +# +# Backend matrix: +# cuda — NVIDIA CUDA, built inside Docker on ubuntu-latest, pushed to GHCR +# rocm — AMD ROCm/HIP, built inside Docker on ubuntu-latest, pushed to GHCR +# cpu — CPU-only, built inside Docker on ubuntu-latest, pushed to GHCR +# metal — Apple Metal, built natively on macos-latest (Metal GPU frameworks +# are unavailable inside Linux containers); binaries uploaded as +# workflow artifacts and attached to GitHub Releases. +# +# Images are tagged: +# ghcr.io/<owner>/atlas/llama-server:<branch>-<backend> +# ghcr.io/<owner>/atlas/llama-server:sha-<sha>-<backend> +# ghcr.io/<owner>/atlas/llama-server:<version>-<backend> (on release) +# +# Trigger conditions: +# • push to main that touches inference/ or this file +# • any pull request that touches inference/ or this file (build only, no push) +# • GitHub Release published (build + push + attach Metal zip to release) +# • workflow_dispatch for ad-hoc builds + +name: Build Inference Images + +on: + push: + branches: [main] + paths: + - "inference/**" + - ".github/workflows/build-inference.yml" + pull_request: + paths: + - "inference/**" + - ".github/workflows/build-inference.yml" + release: + types: [published] + workflow_dispatch: + inputs: + push_images: + description: "Push images to GHCR (linux backends)" + type: boolean + default: false + cuda_architectures: + description: "CUDA architectures (semicolon-separated, e.g. 
89-real;90-real;120-real)" + type: string + default: "89-real;90-real;120-real" + +env: + REGISTRY: ghcr.io + # Image namespace: ghcr.io/<owner>/atlas/llama-server + IMAGE_NAME: ${{ github.repository_owner }}/atlas/llama-server + +jobs: + # ───────────────────────────────────────────────────────────────────────── + # Linux builds: CUDA / ROCm / CPU + # The Dockerfiles contain all compiler toolchains (nvcc, hipcc) so no GPU + # hardware is required on the runner itself — compilation happens inside + # the container image layers. + # ───────────────────────────────────────────────────────────────────────── + build-linux: + name: "${{ matrix.backend }} (ubuntu-latest)" + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + strategy: + fail-fast: false + matrix: + include: + - backend: cuda + # CUDA arch targets: + # 89-real = Ada Lovelace (RTX 4000, L40) + # 90-real = Hopper (H100, H200) + # 120-real = Blackwell (GB200, RTX 5000 series) + # Override via workflow_dispatch input to target a single GPU. 
+ cuda_architectures: "89-real;90-real;120-real" + - backend: rocm + cuda_architectures: "" + - backend: cpu + cuda_architectures: "" + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + # Skip on PRs to avoid credential exposure for untrusted forks + if: > + github.event_name != 'pull_request' && + (github.event_name != 'workflow_dispatch' || inputs.push_images) + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Resolve CUDA architectures + id: cuda_arch + run: | + # workflow_dispatch input overrides matrix default + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && \ + [ -n "${{ inputs.cuda_architectures }}" ]; then + echo "value=${{ inputs.cuda_architectures }}" >> "$GITHUB_OUTPUT" + else + echo "value=${{ matrix.cuda_architectures }}" >> "$GITHUB_OUTPUT" + fi + + - name: Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch,suffix=-${{ matrix.backend }} + type=ref,event=pr,suffix=-${{ matrix.backend }} + type=semver,pattern={{version}},suffix=-${{ matrix.backend }} + type=semver,pattern={{major}}.{{minor}},suffix=-${{ matrix.backend }} + type=sha,prefix=sha-,suffix=-${{ matrix.backend }} + + - name: Build (and push) Docker image + uses: docker/build-push-action@v6 + with: + context: ./inference + file: ./inference/Dockerfile.v31 + build-args: | + GGML_BACKEND=${{ matrix.backend }} + CUDA_ARCHITECTURES=${{ steps.cuda_arch.outputs.value }} + push: >- + ${{ + github.event_name == 'push' || + github.event_name == 'release' || + (github.event_name == 'workflow_dispatch' && inputs.push_images) + }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + # Layer cache scoped per backend so cuda/rocm/cpu don't share + cache-from: type=gha,scope=inference-${{ 
matrix.backend }} + cache-to: type=gha,scope=inference-${{ matrix.backend }},mode=max + + # ───────────────────────────────────────────────────────────────────────── + # Metal build: native macOS + # + # Metal GPU frameworks (Metal.framework, MetalPerformanceShaders, etc.) are + # macOS-only and cannot be accessed from inside a Linux Docker container. + # The binary produced here runs directly on the host — no container needed. + # + # Outputs: + # • workflow artifact: llama-server-metal-macos-arm64 + # • on release: zip attached to the GitHub Release + # ───────────────────────────────────────────────────────────────────────── + build-metal: + name: "metal (macos-latest)" + runs-on: macos-latest + permissions: + contents: write # needed to upload release assets + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install build dependencies + run: brew install cmake + + - name: Clone llama.cpp + run: | + git clone --depth 1 https://github.com/ggml-org/llama.cpp /tmp/llama.cpp + + - name: Build with GGML_METAL=ON + run: | + cd /tmp/llama.cpp + cmake -B build \ + -DGGML_METAL=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_BUILD_TYPE=Release + cmake --build build --config Release -j$(sysctl -n hw.logicalcpu) + + - name: Smoke-test binary + run: /tmp/llama.cpp/build/bin/llama-server --version + + - name: Upload binaries as workflow artifact + uses: actions/upload-artifact@v4 + with: + name: llama-server-metal-macos-arm64 + path: | + /tmp/llama.cpp/build/bin/llama-server + /tmp/llama.cpp/build/bin/llama-cli + if-no-files-found: error + retention-days: 90 + + - name: Attach binaries to GitHub Release + if: github.event_name == 'release' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + cd /tmp/llama.cpp/build/bin + zip -j llama-server-metal-macos-arm64.zip llama-server llama-cli + gh release upload "${{ github.ref_name }}" \ + llama-server-metal-macos-arm64.zip \ + --repo "${{ github.repository }}" diff --git a/benchmark/analysis/hardware_info.py 
b/benchmark/analysis/hardware_info.py index 2b404dc..9797ed7 100644 --- a/benchmark/analysis/hardware_info.py +++ b/benchmark/analysis/hardware_info.py @@ -39,45 +39,28 @@ def run_command(cmd: str, default: str = "") -> str: return default -def get_gpu_info() -> Dict[str, Any]: +def _get_nvidia_gpu_info(info: Dict[str, Any]) -> bool: """ - Get GPU information using nvidia-smi. - - Returns: - Dictionary with GPU model, VRAM, driver version, and power draw + Populate *info* from nvidia-smi. Returns True if NVIDIA GPU was found. """ - info = { - "model": "", - "vram_gb": 0.0, - "driver_version": "", - "power_draw_watts": 0.0 - } - - # Try nvidia-smi - nvidia_smi = run_command("which nvidia-smi") - if not nvidia_smi: - return info + if not run_command("which nvidia-smi"): + return False - # Get GPU name name = run_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits") if name: info["model"] = name.split('\n')[0].strip() - # Get VRAM vram = run_command("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits") if vram: try: - # Convert MiB to GB info["vram_gb"] = float(vram.split('\n')[0].strip()) / 1024 except ValueError: pass - # Get driver version driver = run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits") if driver: info["driver_version"] = driver.split('\n')[0].strip() - # Get current power draw power = run_command("nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits") if power: try: @@ -85,27 +68,111 @@ def get_gpu_info() -> Dict[str, Any]: except ValueError: pass + return bool(info["model"]) + + +def _get_rocm_gpu_info(info: Dict[str, Any]) -> bool: + """ + Populate *info* from rocm-smi (AMD ROCm). Returns True if AMD GPU was found. 
+ """ + if not run_command("which rocm-smi"): + return False + + # rocm-smi --showproductname prints lines like "GPU[0] : Product Name: Radeon RX 7900 XTX" + name = run_command("rocm-smi --showproductname --noheader 2>/dev/null | awk -F': ' '/Product Name/{print $NF; exit}'") + if name: + info["model"] = name.strip() + + # VRAM in bytes → GB + vram = run_command("rocm-smi --showmeminfo vram --noheader 2>/dev/null | awk '/Total Memory/{print $NF; exit}'") + if vram: + try: + info["vram_gb"] = float(vram.strip()) / (1024 ** 3) + except ValueError: + pass + + driver = run_command("rocm-smi --showdriverversion --noheader 2>/dev/null | awk '{print $NF; exit}'") + if driver: + info["driver_version"] = driver.strip() + + return bool(info["model"]) + + +def _get_metal_gpu_info(info: Dict[str, Any]) -> bool: + """ + Populate *info* from system_profiler (Apple Metal / macOS). + Returns True if a Metal GPU was found. + """ + if platform.system() != "Darwin": + return False + + sp_out = run_command("system_profiler SPDisplaysDataType 2>/dev/null") + if not sp_out: + return False + + match = re.search(r'Chipset Model:\s*(.+)', sp_out) + if match: + info["model"] = match.group(1).strip() + + match = re.search(r'VRAM \([^)]+\):\s*([\d.]+)\s*(MB|GB)', sp_out, re.IGNORECASE) + if match: + try: + vram_val = float(match.group(1)) + if match.group(2).upper() == "MB": + vram_val /= 1024 + info["vram_gb"] = vram_val + except ValueError: + pass + + return bool(info["model"]) + + +def get_gpu_info() -> Dict[str, Any]: + """ + Get GPU information, trying NVIDIA, AMD ROCm, and Apple Metal in order. + + Returns: + Dictionary with GPU model, VRAM, driver version, and power draw + """ + info = { + "model": "", + "vram_gb": 0.0, + "driver_version": "", + "power_draw_watts": 0.0 + } + + _get_nvidia_gpu_info(info) or _get_rocm_gpu_info(info) or _get_metal_gpu_info(info) return info def get_cuda_version() -> str: """ - Get CUDA version. + Get the GPU accelerator version (CUDA, ROCm, or Metal). 
Returns: - CUDA version string + Version string for the active GPU accelerator, or empty string. """ - # Try nvcc first + # CUDA — try nvcc then nvidia-smi nvcc_version = run_command("nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'") if nvcc_version: return nvcc_version - # Try nvidia-smi nvidia_smi_output = run_command("nvidia-smi") match = re.search(r'CUDA Version:\s*(\d+\.\d+)', nvidia_smi_output) if match: return match.group(1) + # ROCm — hipconfig + rocm_version = run_command("hipconfig --version 2>/dev/null | head -1") + if rocm_version: + return rocm_version + + # Metal — macOS build version (Metal is always present on modern macOS) + if platform.system() == "Darwin": + macos_ver = run_command("sw_vers -productVersion 2>/dev/null") + if macos_ver: + return f"Metal (macOS {macos_ver.strip()})" + return "" diff --git a/docker-compose.rocm.yml b/docker-compose.rocm.yml new file mode 100644 index 0000000..274fdfa --- /dev/null +++ b/docker-compose.rocm.yml @@ -0,0 +1,24 @@ +# AMD ROCm override for docker-compose.yml +# +# Usage: +# GGML_BACKEND=rocm docker compose -f docker-compose.yml -f docker-compose.rocm.yml up --build +# +# This override: +# - Sets GGML_BACKEND=rocm for the llama-server build +# - Replaces the NVIDIA deploy block with ROCm device mappings +# - Mounts /dev/kfd (ROCm kernel driver) and /dev/dri (GPU render nodes) + +services: + llama-server: + build: + args: + GGML_BACKEND: rocm + deploy: !reset {} # remove the nvidia deploy block + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + group_add: + - video + - render + environment: + - HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION:-} diff --git a/docker-compose.yml b/docker-compose.yml index 07ea2c2..b0a8b00 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,28 @@ services: - # --- LLM Inference (CUDA + grammar support) --- + # --- LLM Inference --- + # Build backend is selected with GGML_BACKEND (cuda|rocm|cpu). 
+ # Pre-built Metal binaries for macOS are produced by the GitHub Actions + # workflow (.github/workflows/build-inference.yml) on macos-latest and + # are not used with docker compose — run llama-server directly on macOS. + # + # GPU access uses the Docker Compose `deploy.resources` spec: + # - NVIDIA (default): works with the nvidia container runtime installed. + # - AMD/ROCm: use docker-compose.rocm.yml override (maps /dev/kfd, /dev/dri). + # - CPU-only: remove the `deploy` block (or override with GGML_BACKEND=cpu). llama-server: build: context: ./inference dockerfile: Dockerfile.v31 - # GPU access: works with Docker (nvidia runtime) and Podman (--device) - devices: - - /dev/nvidia0:/dev/nvidia0 - - /dev/nvidiactl:/dev/nvidiactl - - /dev/nvidia-uvm:/dev/nvidia-uvm - environment: - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=compute,utility + args: + GGML_BACKEND: ${GGML_BACKEND:-cuda} + CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES:-89-real;90-real;120-real} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] volumes: - ${ATLAS_MODELS_DIR:-./models}:/models:ro command: > diff --git a/inference/Dockerfile b/inference/Dockerfile index 44e04e7..79f1d59 100644 --- a/inference/Dockerfile +++ b/inference/Dockerfile @@ -1,19 +1,54 @@ -FROM docker.io/nvidia/cuda:12.8.0-devel-rockylinux9 AS builder -RUN dnf install -y git cmake gcc-c++ && dnf clean all -RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp && \ - cd /llama.cpp && \ - # ATLAS patch: prevent --embeddings from poisoning the draft model context. - # Without this, embedding=true propagates to the draft via params_dft = params_base, - # causing output_all=true and n_ubatch mismatch → 0% spec decode acceptance. - # See: patches/fix-embeddings-spec-decode.patch for details. 
- sed -i '/auto params_dft = params_base;/a\ params_dft.embedding = false; // ATLAS: draft never needs embeddings' \ - tools/server/server-context.cpp && \ - cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CUDA_ARCHITECTURES=120 && \ - cmake --build build --config Release -j$(nproc) +# GGML_BACKEND selects the GPU/compute back-end for Linux container builds: +# cuda (default) — NVIDIA CUDA via nvidia/cuda base image +# rocm — AMD ROCm/HIP via rocm/dev-ubuntu base image +# cpu — CPU-only fallback, no GPU required +# +# Apple Metal requires native macOS toolchain — use the GitHub Actions workflow +# (.github/workflows/build-inference.yml) which runs on macos-latest. +# +# CUDA_ARCHITECTURES targets: +# 89-real — Ada Lovelace (RTX 4000 series, L40) +# 90-real — Hopper (H100, H200) +# 120-real — Blackwell (GB200, RTX 5000 series) +# Narrow to your GPU to reduce compile time, e.g.: +# docker build --build-arg CUDA_ARCHITECTURES=89-real . +ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real -FROM docker.io/nvidia/cuda:12.8.0-runtime-rockylinux9 +# === Build-stage base images (only the selected one is resolved) === +FROM docker.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 AS base-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS base-rocm +FROM ubuntu:24.04 AS base-cpu + +# === Builder === +FROM base-${GGML_BACKEND} AS builder +ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real + +ENV DEBIAN_FRONTEND=noninteractive TMPDIR=/llama.cpp/tmp +RUN apt-get update && apt-get install -y --no-install-recommends \ + git cmake gcc g++ make \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp \ + && mkdir -p /llama.cpp/tmp \ + && cd /llama.cpp \ + && case "$GGML_BACKEND" in \ + cuda) CMAKE_EXTRA="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}" ;; \ + rocm) CMAKE_EXTRA="-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx906;gfx908;gfx90a;gfx1030;gfx1100" ;; \ + *) CMAKE_EXTRA="" ;; 
\ + esac \ + && cmake -B build -DBUILD_SHARED_LIBS=OFF ${CMAKE_EXTRA} \ + && cmake --build build --config Release -j$(nproc) + +# === Runtime base images === +FROM docker.io/nvidia/cuda:12.8.0-runtime-ubuntu22.04 AS runtime-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS runtime-rocm +FROM ubuntu:24.04 AS runtime-cpu + +FROM runtime-${GGML_BACKEND} AS final COPY --from=builder /llama.cpp/build/bin/llama-server /usr/local/bin/ -COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ +COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ RUN mkdir -p /models /templates COPY templates/ /templates/ COPY entrypoint.sh /entrypoint.sh diff --git a/inference/Dockerfile.v31 b/inference/Dockerfile.v31 index 4181ff1..ca67263 100644 --- a/inference/Dockerfile.v31 +++ b/inference/Dockerfile.v31 @@ -1,21 +1,57 @@ -FROM docker.io/nvidia/cuda:12.8.0-devel-rockylinux9 AS builder -RUN dnf install -y git cmake gcc-c++ && dnf clean all -# Use /llama.cpp/tmp for build temp to avoid /tmp space exhaustion -ENV TMPDIR=/llama.cpp/tmp -RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp && \ - cd /llama.cpp && \ - mkdir -p /llama.cpp/tmp && \ - # ATLAS patch: prevent --embeddings from poisoning the draft model context. - # Kept for compatibility — harmless when no draft model is used (Qwen3.5). - # If the patch target line doesn't exist in newer versions, the sed is a no-op. 
- sed -i '/auto params_dft = params_base;/a\ params_dft.embedding = false; // ATLAS: draft never needs embeddings' \ - tools/server/server-context.cpp 2>/dev/null || true && \ - cmake -B build -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CUDA_ARCHITECTURES=120 && \ - cmake --build build --config Release -j5 +# GGML_BACKEND selects the GPU/compute back-end for Linux container builds: +# cuda (default) — NVIDIA CUDA via nvidia/cuda base image +# rocm — AMD ROCm/HIP via rocm/dev-ubuntu base image +# cpu — CPU-only fallback, no GPU required +# +# Apple Metal is NOT supported here. Metal requires native macOS toolchain +# (Xcode + Metal framework) that is unavailable inside any Linux container. +# Use the GitHub Actions workflow (.github/workflows/build-inference.yml), +# which runs on macos-latest, to produce Metal binaries. +# +# CUDA_ARCHITECTURES targets: +# 89-real — Ada Lovelace (RTX 4000 series, L40) +# 90-real — Hopper (H100, H200) +# 120-real — Blackwell (GB200, RTX 5000 series) +# Narrow to your GPU to reduce compile time, e.g.: +# docker build --build-arg GGML_BACKEND=rocm . +# docker build --build-arg CUDA_ARCHITECTURES=89-real . 
+ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real -FROM docker.io/nvidia/cuda:12.8.0-runtime-rockylinux9 +# === Build-stage base images (only the selected one is resolved) === +FROM docker.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 AS base-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS base-rocm +FROM ubuntu:24.04 AS base-cpu + +# === Builder === +FROM base-${GGML_BACKEND} AS builder +ARG GGML_BACKEND=cuda +ARG CUDA_ARCHITECTURES=89-real;90-real;120-real + +ENV DEBIAN_FRONTEND=noninteractive TMPDIR=/llama.cpp/tmp +RUN apt-get update && apt-get install -y --no-install-recommends \ + git cmake gcc g++ make \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/ggml-org/llama.cpp /llama.cpp \ + && mkdir -p /llama.cpp/tmp \ + && cd /llama.cpp \ + && case "$GGML_BACKEND" in \ + cuda) CMAKE_EXTRA="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}" ;; \ + rocm) CMAKE_EXTRA="-DGGML_HIP=ON -DAMDGPU_TARGETS=gfx906;gfx908;gfx90a;gfx1030;gfx1100" ;; \ + *) CMAKE_EXTRA="" ;; \ + esac \ + && cmake -B build -DBUILD_SHARED_LIBS=OFF ${CMAKE_EXTRA} \ + && cmake --build build --config Release -j$(nproc) + +# === Runtime base images === +FROM docker.io/nvidia/cuda:12.8.0-runtime-ubuntu22.04 AS runtime-cuda +FROM docker.io/rocm/dev-ubuntu-24.04:6.3-complete AS runtime-rocm +FROM ubuntu:24.04 AS runtime-cpu + +FROM runtime-${GGML_BACKEND} AS final COPY --from=builder /llama.cpp/build/bin/llama-server /usr/local/bin/ -COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ +COPY --from=builder /llama.cpp/build/bin/llama-cli /usr/local/bin/ RUN mkdir -p /models /templates COPY templates/ /templates/ COPY entrypoint-v3.1-9b.sh /entrypoint.sh diff --git a/inference/entrypoint-mtp.sh b/inference/entrypoint-mtp.sh index ddba8b0..664b611 100755 --- a/inference/entrypoint-mtp.sh +++ b/inference/entrypoint-mtp.sh @@ -14,9 +14,12 @@ KV_FLAGS="-ctk $KV_CACHE_K -ctv $KV_CACHE_V" PARALLEL="${PARALLEL_SLOTS:-4}" 
MODEL_FILE="${MODEL_PATH:-/models/Qwen3.5-9B-MTP-Q4_K_M-F16mtp.gguf}" -export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" -export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" -export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +# CUDA performance tuning — ignored on non-CUDA backends +if command -v nvidia-smi >/dev/null 2>&1; then + export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" + export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" + export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +fi echo "=== V3.1 MTP: Qwen3.5-9B — Generation + MTP + Self-Embeddings ===" echo " Model: $MODEL_FILE" diff --git a/inference/entrypoint-v3-specdec.sh b/inference/entrypoint-v3-specdec.sh index 2d708dd..7bd4700 100644 --- a/inference/entrypoint-v3-specdec.sh +++ b/inference/entrypoint-v3-specdec.sh @@ -25,9 +25,12 @@ TEMPLATE="${CHAT_TEMPLATE:-Qwen3-custom.jinja}" PARALLEL="${PARALLEL_SLOTS:-2}" DRAFT_MODEL="${DRAFT_MODEL:-/models/Qwen3-0.6B-Q8_0.gguf}" -export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" -export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" -export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +# CUDA performance tuning — ignored on non-CUDA backends +if command -v nvidia-smi >/dev/null 2>&1; then + export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" + export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" + export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +fi echo "=== V3: Generation + Self-Embeddings + Speculative Decoding ===" echo " Context: $CTX_LENGTH (draft: $DRAFT_CTX) | KV: $KV_CACHE_TYPE | Parallel: $PARALLEL" diff --git a/inference/entrypoint-v3.1-9b.sh b/inference/entrypoint-v3.1-9b.sh index 02da5ef..ca591ea 100755 --- a/inference/entrypoint-v3.1-9b.sh +++ b/inference/entrypoint-v3.1-9b.sh @@ -29,9 +29,12 @@ KV_FLAGS="-ctk $KV_CACHE_K -ctv $KV_CACHE_V" PARALLEL="${PARALLEL_SLOTS:-4}" MODEL_FILE="${MODEL_PATH:-/models/Qwen3.5-9B-Q6_K.gguf}" 
-export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" -export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" -export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +# CUDA performance tuning — ignored on non-CUDA backends +if command -v nvidia-smi >/dev/null 2>&1; then + export GGML_CUDA_NO_PINNED="${GGML_CUDA_NO_PINNED:-0}" + export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-1}" + export CUDA_MODULE_LOADING="${CUDA_MODULE_LOADING:-LAZY}" +fi echo "=== V3.1: Qwen3.5-9B — Generation + Self-Embeddings ===" echo " Model: $MODEL_FILE"