# Workflow file captured from CI run of PR "docs(readme) Add TOC" (#260).

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
# GPU tests live in a separate workflow because NVIDIA self-hosted runners
# block pull_request events entirely. Keeping them here avoids a confusing
# "Skipped" entry with unresolved matrix names on every PR.
name: CI / GPU
on:
workflow_dispatch:
push:
branches:
- main
- "pull-request/[0-9]+"
merge_group:
types:
- checks_requested
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
PIP_NO_CACHE_DIR: "1"
PIP_DISABLE_PIP_VERSION_CHECK: "1"
PIP_PREFER_BINARY: "1"
jobs:
gpu-tests:
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 45
strategy:
fail-fast: false
matrix:
python-version: ["3.11", "3.12", "3.13"]
# cu128 = representative CUDA 12.x wheel; cu130 = CUDA 13.0 wheel.
torch-cuda: ["cu128", "cu130"]
name: "gpu / py${{ matrix.python-version }} / ${{ matrix.torch-cuda }}"
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs gcc software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update
apt-get install -y \
python${{ matrix.python-version }} \
python${{ matrix.python-version }}-venv \
python${{ matrix.python-version }}-dev
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify GPU
run: nvidia-smi
- name: Install dependencies and run tests
run: bash code/scripts/check_python_compat.sh
env:
PYTHON_BIN: python${{ matrix.python-version }}
MODE: train
SKIP_TESTS: "0"
REQUIRE_GPU: "1"
TORCH_CUDA: ${{ matrix.torch-cuda }}
VENV_DIR: .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}
REQ_FILE: code/requirements_public_gpu_${{ matrix.torch-cuda == 'cu130' && 'cu13' || 'cu12' }}.txt
- name: Training + inference with LER check
shell: bash
run: |
source .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}/bin/activate
bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_train.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
# 0.35: short run (16k samples, 2 epochs for stable LER across py versions)
python code/scripts/check_ler_from_log.py /tmp/ci_train.log --max-ler 0.35
env:
EXPERIMENT_NAME: ci_short
PREDECODER_TRAIN_SAMPLES: "16384"
PREDECODER_VAL_SAMPLES: "2048"
PREDECODER_TEST_SAMPLES: "2048"
PREDECODER_TRAIN_EPOCHS: "2"
- name: Training + inference with multi-worker DataLoader (num_workers=2)
shell: bash
run: |
source .venv_train_${{ matrix.python-version }}_${{ matrix.torch-cuda }}/bin/activate
bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_multiworker.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
python code/scripts/check_ler_from_log.py /tmp/ci_multiworker.log --max-ler 0.35
env:
EXPERIMENT_NAME: ci_multiworker
PREDECODER_TRAIN_SAMPLES: "16384"
PREDECODER_VAL_SAMPLES: "2048"
PREDECODER_TEST_SAMPLES: "2048"
PREDECODER_TRAIN_EPOCHS: "2"
PREDECODER_INFERENCE_NUM_WORKERS: "2"
# ---------------------------------------------------------------------------
# Mid-tier (~5-10 min): extended training + inference with LER check.
# Runs only after merge to main (not on PR branches) to save GPU time.
# Single Python version — multi-version coverage is handled by gpu-tests.
# ---------------------------------------------------------------------------
mid-gpu-tests:
if: github.ref == 'refs/heads/main'
needs: gpu-tests
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 40
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs gcc software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update
apt-get install -y python3.13 python3.13-venv python3.13-dev
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify GPU
run: nvidia-smi
- name: Install Python dependencies
run: |
python3.13 -m venv .venv_mid
. .venv_mid/bin/activate
python -m pip install --upgrade pip setuptools wheel
# TODO: matrix by CUDA major version [cu12, cu13]
pip install -r code/requirements_public_train-cu12.txt
- name: Mid-tier training + inference with LER check (32k train, 2 epochs)
shell: bash
run: |
. .venv_mid/bin/activate
bash code/scripts/smoke_run.sh 2>&1 | tee /tmp/ci_mid.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
# 0.2: mid-tier (32k/2 epochs); loosen if flaky
python code/scripts/check_ler_from_log.py /tmp/ci_mid.log --max-ler 0.2
env:
EXPERIMENT_NAME: ci_mid
PREDECODER_TRAIN_SAMPLES: "32768"
PREDECODER_VAL_SAMPLES: "4096"
PREDECODER_TEST_SAMPLES: "4096"
PREDECODER_TRAIN_EPOCHS: "2"
- name: HE compile tests (torch.compile + autotune on GPU)
run: |
. .venv_mid/bin/activate
PYTHONPATH=code python -m unittest discover -s code/tests/mid -p "test_*.py" -v
# ---------------------------------------------------------------------------
# Multi-GPU tests: validates NCCL, DDP gradient sync, and per-rank data
# generation across 2 GPUs.
#
# Runner requirement: a self-hosted runner with >=2 GPUs.
# NVIDIA GHA runners follow the naming pattern
# linux-amd64-gpu-<model>-latest-<gpu-count>
# so the 2-GPU variant of the existing rtxpro6000 runner would be:
# linux-amd64-gpu-rtxpro6000-latest-2
# Confirm this label with your runner pool before enabling; if no 2-GPU
# runner exists the job will queue indefinitely.
#
# Runs only after merge to main (not on PR branches) to conserve GPU quota.
# ---------------------------------------------------------------------------
multi-gpu-tests:
needs: gpu-tests
runs-on: linux-amd64-gpu-rtxpro6000-latest-2
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 20
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs python3 python3-pip python3-venv
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify 2 GPUs are visible
run: |
nvidia-smi
count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo "GPU count: ${count}"
[ "${count}" -ge 2 ] || { echo "ERROR: expected >=2 GPUs, found ${count}"; exit 1; }
- name: Install Python dependencies
run: |
python3 -m venv .venv_multigpu
. .venv_multigpu/bin/activate
python -m pip install --upgrade pip setuptools wheel
# TODO: matrix by CUDA major version [cu12, cu13]
pip install -r code/requirements_public_train-cu12.txt
- name: Run multi-GPU unit tests
run: |
. .venv_multigpu/bin/activate
PYTHONPATH=code python -m unittest discover \
-s code/tests -p "test_multi_gpu.py" -v
- name: Multi-GPU smoke training (2 GPUs, DDP)
# smoke_run.sh hardcodes GPUS=1; call local_run.sh directly so we
# can pass GPUS=2 and exercise the torch.distributed.run path.
shell: bash
run: |
. .venv_multigpu/bin/activate
export PREDECODER_TIMING_RUN=1
export PREDECODER_DISABLE_SDR=1
export PREDECODER_LER_FINAL_ONLY=1
export PREDECODER_INFERENCE_NUM_SAMPLES=32
export PREDECODER_INFERENCE_LATENCY_SAMPLES=0
export PREDECODER_INFERENCE_MEAS_BASIS=both
export PREDECODER_INFERENCE_NUM_WORKERS=0
EXPERIMENT_NAME=ci_multi_gpu WORKFLOW=train GPUS=2 \
bash code/scripts/local_run.sh 2>&1 | tee /tmp/ci_multigpu_train.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
EXPERIMENT_NAME=ci_multi_gpu WORKFLOW=inference GPUS=2 \
bash code/scripts/local_run.sh 2>&1 | tee /tmp/ci_multigpu_infer.log
r=${PIPESTATUS[0]}; [ $r -ne 0 ] && exit $r
# [LER Validation] lines are emitted during training, not inference
python code/scripts/check_ler_from_log.py /tmp/ci_multigpu_train.log --max-ler 0.35
env:
PREDECODER_TRAIN_SAMPLES: "16384"
PREDECODER_VAL_SAMPLES: "2048"
PREDECODER_TEST_SAMPLES: "2048"
PREDECODER_TRAIN_EPOCHS: "2"
# ---------------------------------------------------------------------------
# GPU coverage: captures GPU-specific code paths missed by the CPU coverage job
# ---------------------------------------------------------------------------
gpu-coverage:
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container:
image: ubuntu:24.04
options: -u root --security-opt seccomp=unconfined --shm-size 16g
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
timeout-minutes: 20
steps:
- name: Setup proxy cache
uses: nv-gha-runners/setup-proxy-cache@main
with:
enable-apt: true
- name: Install system dependencies
run: |
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y git git-lfs python3 python3-pip python3-venv
git lfs install
- uses: actions/checkout@v4
with:
lfs: true
- name: Verify GPU
run: nvidia-smi
- name: Install Python dependencies
run: |
python3 -m venv .venv_gpu_cov
. .venv_gpu_cov/bin/activate
python -m pip install --upgrade pip setuptools wheel
# TODO: matrix by CUDA major version [cu12, cu13]
pip install -r code/requirements_public_train-cu12.txt
pip install -r code/requirements_ci.txt
- name: Run tests with GPU coverage
run: |
. .venv_gpu_cov/bin/activate
PYTHONPATH=code coverage run -m unittest discover -s code/tests -p "test_*.py"
coverage report
coverage html -d htmlcov-gpu
coverage xml -o coverage-gpu.xml
- name: Upload GPU coverage artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: gpu-coverage-report
path: |
htmlcov-gpu/
coverage-gpu.xml