# Workflow file captured from CI run of PR "docs(readme) Add TOC" (#260).

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
# GPU tests live in a separate workflow because NVIDIA self-hosted runners
# block pull_request events entirely. Keeping them here avoids a confusing
# "Skipped" entry with unresolved matrix names on every PR.
name: CI / GPU
on:
workflow_dispatch:
push:
branches:
- main
- "pull-request/[0-9]+"
merge_group:
types:
- checks_requested
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
PIP_NO_CACHE_DIR: "1"
PIP_DISABLE_PIP_VERSION_CHECK: "1"
PIP_PREFER_BINARY: "1"
jobs:
gpu-tests:
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 45
strategy:
fail-fast: false
matrix:
python-version: ["3.11", "3.12", "3.13"]
# cu128 = representative CUDA 12.x wheel; cu130 = CUDA 13.0 wheel.
torch-cuda: ["cu128", "cu130"]
name: "gpu / py${{ matrix.python-version }} / ${{ matrix.torch-cuda }}"
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs gcc software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update
apt-get install -y \
python${{ matrix.python-version }} \
python${{ matrix.python-version }}-venv \
python${{ matrix.python-version }}-dev
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify GPU
run: nvidia-smi
- name: Install dependencies and run tests
run: bash code/scripts/check_python_compat.sh
env:
PYTHON_BIN: python${{ matrix.python-version }}
MODE: train
SKIP_TESTS: "0"
REQUIRE_GPU: "1"
TORCH_CUDA: ${{ matrix.torch-cuda }}
VENV_DIR: .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}
REQ_FILE: code/requirements_public_gpu_${{ matrix.torch-cuda == 'cu130' && 'cu13' || 'cu12' }}.txt
- name: Training + inference with LER check
shell: bash
run: |
source .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}/bin/activate
bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_train.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
# 0.35: short run (16k samples, 2 epochs for stable LER across py versions)
python code/scripts/check_ler_from_log.py /tmp/ci_train.log --max-ler 0.35
env:
EXPERIMENT_NAME: ci_short
PREDECODER_TRAIN_SAMPLES: "16384"
PREDECODER_VAL_SAMPLES: "2048"
PREDECODER_TEST_SAMPLES: "2048"
PREDECODER_TRAIN_EPOCHS: "2"
- name: Training + inference with multi-worker DataLoader (num_workers=2)
shell: bash
run: |
source .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}/bin/activate
bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_multiworker.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
python code/scripts/check_ler_from_log.py /tmp/ci_multiworker.log --max-ler 0.35
env:
EXPERIMENT_NAME: ci_multiworker
PREDECODER_TRAIN_SAMPLES: "16384"
PREDECODER_VAL_SAMPLES: "2048"
PREDECODER_TEST_SAMPLES: "2048"
PREDECODER_TRAIN_EPOCHS: "2"
PREDECODER_INFERENCE_NUM_WORKERS: "2"
# ---------------------------------------------------------------------------
# Mid-tier (~5-10 min): extended training + inference with LER check.
# Runs only after merge to main (not on PR branches) to save GPU time.
# Single Python version — multi-version coverage is handled by gpu-tests.
# ---------------------------------------------------------------------------
mid-gpu-tests:
if: github.ref == 'refs/heads/main'
needs: gpu-tests
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 40
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs gcc software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update
apt-get install -y python3.13 python3.13-venv python3.13-dev
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify GPU
run: nvidia-smi
- name: Install Python dependencies
run: |
python3.13 -m venv .venv_mid
. .venv_mid/bin/activate
python -m pip install --upgrade pip setuptools wheel
# TODO: matrix by CUDA major version [cu12, cu13]
pip install -r code/requirements_public_train-cu12.txt
- name: Mid-tier training + inference with LER check (32k train, 2 epochs)
shell: bash
run: |
. .venv_mid/bin/activate
bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_mid.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
# 0.2: mid-tier (32k/2 epochs); loosen if flaky
python code/scripts/check_ler_from_log.py /tmp/ci_mid.log --max-ler 0.2
env:
EXPERIMENT_NAME: ci_mid
PREDECODER_TRAIN_SAMPLES: "32768"
PREDECODER_VAL_SAMPLES: "4096"
PREDECODER_TEST_SAMPLES: "4096"
PREDECODER_TRAIN_EPOCHS: "2"
- name: HE compile tests (torch.compile + autotune on GPU)
run: |
. .venv_mid/bin/activate
PYTHONPATH=code python -m unittest discover -s code/tests/mid -p "test_*.py" -v
# ---------------------------------------------------------------------------
# Multi-GPU tests: validates NCCL, DDP gradient sync, and per-rank data
# generation across 2 GPUs.
#
# Runner requirement: a self-hosted runner with >=2 GPUs.
# NVIDIA GHA runners follow the naming pattern
# linux-amd64-gpu-<model>-latest-<gpu-count>
# so the 2-GPU variant of the existing rtxpro6000 runner would be:
# linux-amd64-gpu-rtxpro6000-latest-2
# Confirm this label with your runner pool before enabling; if no 2-GPU
# runner exists the job will queue indefinitely.
#
# Runs only after merge to main (not on PR branches) to conserve GPU quota.
# ---------------------------------------------------------------------------
multi-gpu-tests:
needs: gpu-tests
runs-on: linux-amd64-gpu-rtxpro6000-latest-2
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 20
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs python3 python3-pip python3-venv
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify 2 GPUs are visible
run: |
nvidia-smi
count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo "GPU count: ${count}"
[ "${count}" -ge 2 ] || { echo "ERROR: expected >=2 GPUs, found ${count}"; exit 1; }
- name: Install Python dependencies
run: |
python3 -m venv .venv_multigpu
. .venv_multigpu/bin/activate
python -m pip install --upgrade pip setuptools wheel
# TODO: matrix by CUDA major version [cu12, cu13]
pip install -r code/requirements_public_train-cu12.txt
- name: Run multi-GPU unit tests
run: |
. .venv_multigpu/bin/activate
PYTHONPATH=code python -m unittest discover \
-s code/tests -p "test_multi_gpu.py" -v
- name: Multi-GPU smoke training (2 GPUs, DDP)
# smoke_run.sh hardcodes GPUS=1; call local_run.sh directly so we
# can pass GPUS=2 and exercise the torch.distributed.run path.
shell: bash
run: |
. .venv_multigpu/bin/activate
export PREDECODER_TIMING_RUN=1
export PREDECODER_DISABLE_SDR=1
export PREDECODER_LER_FINAL_ONLY=1
export PREDECODER_INFERENCE_NUM_SAMPLES=32
export PREDECODER_INFERENCE_LATENCY_SAMPLES=0
export PREDECODER_INFERENCE_MEAS_BASIS=both
export PREDECODER_INFERENCE_NUM_WORKERS=0
EXPERIMENT_NAME=ci_multi_gpu WORKFLOW=train GPUS=2 \
bash code/scripts/local_run.sh 2>&1 | tee /tmp/ci_multigpu_train.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
EXPERIMENT_NAME=ci_multi_gpu WORKFLOW=inference GPUS=2 \
bash code/scripts/local_run.sh 2>&1 | tee /tmp/ci_multigpu_infer.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
# [LER Validation] lines are emitted during training, not inference
python code/scripts/check_ler_from_log.py /tmp/ci_multigpu_train.log --max-ler 0.35
env:
PREDECODER_TRAIN_SAMPLES: "16384"
PREDECODER_VAL_SAMPLES: "2048"
PREDECODER_TEST_SAMPLES: "2048"
PREDECODER_TRAIN_EPOCHS: "2"
# ---------------------------------------------------------------------------
# GPU coverage: captures GPU-specific code paths missed by the CPU coverage job
# ---------------------------------------------------------------------------
gpu-coverage:
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 20
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs python3 python3-pip python3-venv
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify GPU
run: nvidia-smi
- name: Install Python dependencies
run: |
python3 -m venv .venv_gpu_cov
. .venv_gpu_cov/bin/activate
python -m pip install --upgrade pip setuptools wheel
# TODO: matrix by CUDA major version [cu12, cu13]
pip install -r code/requirements_public_train-cu12.txt
pip install -r code/requirements_ci.txt
- name: Run tests with GPU coverage
run: |
. .venv_gpu_cov/bin/activate
PYTHONPATH=code coverage run -m unittest discover -s code/tests -p "test_*.py"
coverage report
coverage html -d htmlcov-gpu
coverage xml -o coverage-gpu.xml
- name: Upload GPU coverage artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: gpu-coverage-report
path: |
htmlcov-gpu/
coverage-gpu.xml