6 changes: 3 additions & 3 deletions cuda_core/examples/cuda_graphs.py
@@ -81,9 +81,9 @@ def main():
     result3 = cp.empty_like(a)
 
     # Prepare launch configuration
-    block_size = 256
-    grid_size = (size + block_size - 1) // block_size
-    config = LaunchConfig(grid=grid_size, block=block_size)
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
 
     # Sync before graph capture
     dev.sync()
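A side note on the computation being renamed here: `grid` is the usual ceiling division, so the launch covers every element even when `size` is not a multiple of `block`. A minimal standalone sketch (plain Python, helper name hypothetical):

```python
def blocks_needed(size: int, block: int) -> int:
    # Ceiling division: adds one partial block when size % block != 0.
    return (size + block - 1) // block

assert blocks_needed(1024, 256) == 4  # exact multiple
assert blocks_needed(1025, 256) == 5  # one extra block for the tail
```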
17 changes: 14 additions & 3 deletions cuda_core/examples/gl_interop_plasma.py
@@ -94,8 +94,8 @@ def setup_cuda(kernel_source):
     dev.set_current()
     stream = dev.create_stream()
 
-    opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(kernel_source, code_type="c++", options=opts)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(kernel_source, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("plasma")
 
@@ -114,14 +114,25 @@ def create_window():
"""Open a pyglet window and return (window, gl_module)."""
try:
import pyglet
from pyglet.gl import gl as _gl
except ImportError:
print(
"This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet",
file=sys.stderr,
)
sys.exit(1)

try:
from pyglet.gl import gl as _gl
except ImportError as exc:
print(
"pyglet is installed, but OpenGL could not be initialized.\n"
f"Underlying import error: {exc}\n"
"Ensure your system provides OpenGL runtime libraries (for example, libGL on Linux), "
"and that graphics/display support is available.",
file=sys.stderr,
)
sys.exit(1)

window = pyglet.window.Window(
WIDTH,
HEIGHT,
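The split above distinguishes "pyglet is not installed" from "pyglet is installed but its OpenGL backend cannot load". For readers adapting the pattern elsewhere, a minimal standalone sketch of the same idea (messages illustrative):

```python
import importlib.util
import sys

# Stage 1: is the package installed at all?
if importlib.util.find_spec("pyglet") is None:
    sys.exit("pyglet is not installed; run: pip install pyglet")

# Stage 2: the package exists, but its OpenGL backend may still fail
# to load (e.g. libGL missing on a headless Linux box).
try:
    from pyglet.gl import gl  # noqa: F401  (import check only)
except ImportError as exc:
    sys.exit(f"pyglet is installed but OpenGL failed to load: {exc}")
```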
14 changes: 7 additions & 7 deletions cuda_core/examples/pytorch_example.py
@@ -48,7 +48,7 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python
 
 
-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -60,7 +60,7 @@ def __cuda_stream__(self):
 )
 
 # Run in single precision
-ker = mod.get_kernel("saxpy_kernel<float>")
+kernel = mod.get_kernel("saxpy_kernel<float>")
 dtype = torch.float32
 
 # prepare input/output
@@ -75,16 +75,16 @@ def __cuda_stream__(self):
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on our stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a.item() * x + y)
 
 # let's repeat again with double precision
-ker = mod.get_kernel("saxpy_kernel<double>")
+kernel = mod.get_kernel("saxpy_kernel<double>")
 dtype = torch.float64
 
 # prepare input
@@ -101,10 +101,10 @@ def __cuda_stream__(self):
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on PyTorch's stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a * x + y)
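For context on the renamed `stream`: it adopts PyTorch's stream through the `__cuda_stream__` protocol visible in the hunk headers. The wrapper class body sits mostly outside the visible hunks, so the following is a reconstruction from the fragments above, not the file's verbatim code:

```python
import torch

class PyTorchStreamWrapper:
    """Expose a torch.cuda.Stream via the __cuda_stream__ protocol."""

    def __init__(self, pt_stream: torch.cuda.Stream):
        self.pt_stream = pt_stream

    def __cuda_stream__(self):
        stream_id = self.pt_stream.cuda_stream  # raw CUDA stream handle
        return (0, stream_id)  # (version, handle) format required by CUDA Python
```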
30 changes: 15 additions & 15 deletions cuda_core/examples/saxpy.py
@@ -35,7 +35,7 @@

 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -51,7 +51,7 @@
 )
 
 # run in single precision
-ker = mod.get_kernel("saxpy<float>")
+kernel = mod.get_kernel("saxpy<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -61,24 +61,24 @@
 x = rng.random(size, dtype=dtype)
 y = rng.random(size, dtype=dtype)
 out = cp.empty_like(x)
-dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+dev.sync()  # cupy runs on a different stream from stream, so sync before accessing
 
 # prepare launch
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 assert cp.allclose(out, a * x + y)
 
 # let's repeat again, this time allocating our own out buffer instead of cupy's
 # run in double precision
-ker = mod.get_kernel("saxpy<double>")
+kernel = mod.get_kernel("saxpy<double>")
 dtype = cp.float64
 
 # prepare input
@@ -91,18 +91,18 @@
 # prepare output
 buf = dev.allocate(
     size * 8,  # = dtype.itemsize
-    stream=s,
+    stream=stream,
 )
 
 # prepare launch
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 # we wrap output buffer as a cupy array for simplicity
@@ -113,5 +113,5 @@

 # clean up resources that we allocate
 # cupy automatically cleans up the rest
-buf.close(s)
-s.close()
+buf.close(stream)
+stream.close()
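Aside: unlike the CuPy-owned arrays, `buf` comes from `dev.allocate(..., stream=stream)`, so it is stream-ordered and must be released in stream order too, which is why `buf.close(stream)` takes the stream. A minimal sketch of that lifecycle using only calls that appear above (size illustrative):

```python
from cuda.core import Device

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Stream-ordered allocation: usable once prior work on `stream` has run.
buf = dev.allocate(1024, stream=stream)

# ... launch kernels that read/write `buf` on `stream` ...

buf.close(stream)  # free in stream order, after the enqueued kernels
stream.close()
```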
14 changes: 7 additions & 7 deletions cuda_core/examples/simple_multi_gpu_example.py
@@ -13,7 +13,7 @@

 import cupy as cp
 
-from cuda.core import Device, LaunchConfig, Program, launch, system
+from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
 
 if system.get_num_devices() < 2:
     print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -40,9 +40,9 @@
 }
 }
 """
-prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
 mod_add = prog_add.compile("cubin")
-ker_add = mod_add.get_kernel("vector_add")
+add_kernel = mod_add.get_kernel("vector_add")
 
 # Set GPU 1
 dev1 = Device(1)
@@ -62,9 +62,9 @@
 }
 }
 """
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
 mod_sub = prog_sub.compile("cubin")
-ker_sub = mod_sub.get_kernel("vector_sub")
+sub_kernel = mod_sub.get_kernel("vector_sub")
 
 
 # This adaptor ensures that any foreign stream (ex: from CuPy) that has not
@@ -100,7 +100,7 @@ def __cuda_stream__(self):
 stream0.wait(cp_stream0)
 
 # Launch the add kernel on GPU 0 / stream 0
-launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
 
 # Allocate memory on GPU 1
 # Note: This runs on CuPy's current stream for GPU 1.
@@ -115,7 +115,7 @@ def __cuda_stream__(self):
 stream1.wait(cp_stream1)
 
 # Launch the subtract kernel on GPU 1 / stream 1
-launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
 
 # Synchronize both GPUs and validate the results
 dev0.set_current()
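The substantive fix in this file (beyond the renames) is that `options` was a plain dict; the PR passes a `ProgramOptions` instance instead, matching the other examples. A minimal sketch of the corrected pattern, using only calls that appear in this PR (the kernel source is a placeholder):

```python
from cuda.core import Device, Program, ProgramOptions

dev = Device(0)
dev.set_current()

code = r"""
extern "C" __global__ void noop() {}
"""

# ProgramOptions is a typed options object; the keyword names mirror
# the old dict keys ("std", "arch") used before this change.
options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
prog = Program(code, code_type="c++", options=options)
mod = prog.compile("cubin")
kernel = mod.get_kernel("noop")
```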
16 changes: 8 additions & 8 deletions cuda_core/examples/strided_memory_view_gpu.py
@@ -57,7 +57,7 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream, gpu_ker):
+def my_func(arr, work_stream, kernel):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
@@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker):
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
-    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    launch(work_stream, config, kernel, view.ptr, np.uint64(size))
     # Here we're being conservative and synchronize over our work stream,
     # assuming we do not know the data stream; if we know then we could
     # just order the data stream after the work stream here, e.g.
@@ -101,24 +101,24 @@ def run():
     # To know the GPU's compute capability, we need to identify which GPU to use.
     dev = Device(0)
     dev.set_current()
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
+    prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel(func_name)
 
-    s = dev.create_stream()
+    stream = dev.create_stream()
     try:
         # Create input array on GPU
         arr_gpu = cp.ones(1024, dtype=cp.int32)
         print(f"before: {arr_gpu[:10]=}")
 
         # Run the workload
-        my_func(arr_gpu, s, gpu_ker)
+        my_func(arr_gpu, stream, kernel)
 
         # Check the result
         print(f"after: {arr_gpu[:10]=}")
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
-        s.close()
+        stream.close()
 
 
 if __name__ == "__main__":
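The truncated comment about ordering the data stream after the work stream refers to the same `wait` mechanism used in the multi-GPU example (`stream0.wait(cp_stream0)`). A minimal sketch of that non-blocking alternative to `work_stream.sync()` (stream names illustrative):

```python
from cuda.core import Device

dev = Device()
dev.set_current()
work_stream = dev.create_stream()
data_stream = dev.create_stream()

# ... launch kernels on work_stream that produce some array ...

# Order data_stream after work_stream: later work on data_stream sees
# the results, without blocking the host the way a sync() would.
data_stream.wait(work_stream)
```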
4 changes: 2 additions & 2 deletions cuda_core/examples/thread_block_cluster.py
@@ -94,7 +94,7 @@
     options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
 )
 mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
+kernel = mod.get_kernel("check_cluster_info")
 
 # prepare launch config
 grid = 4
@@ -122,7 +122,7 @@
 block_dims[:] = 0
 
 # launch kernel on the default stream
-launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
 dev.sync()
 
 # verify results
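For readers new to this example: the `config` used in the launch groups thread blocks into clusters. The `LaunchConfig` line itself falls outside the visible hunks, so this sketch reconstructs it from the surrounding code and should be treated as an assumption about the file, not a quote (cluster launches also require a recent GPU architecture, e.g. sm_90+):

```python
from cuda.core import LaunchConfig

# grid counts thread blocks; cluster groups them within the grid.
config = LaunchConfig(grid=4, cluster=2, block=32)
```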
12 changes: 6 additions & 6 deletions cuda_core/examples/vector_add.py
@@ -30,15 +30,15 @@

 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
 prog = Program(code, code_type="c++", options=program_options)
 mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
 # run in single precision
-ker = mod.get_kernel("vector_add<float>")
+kernel = mod.get_kernel("vector_add<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -48,17 +48,17 @@
 b = rng.random(size, dtype=dtype)
 c = cp.empty_like(a)
 
-# cupy runs on a different stream from s, so sync before accessing
+# cupy runs on a different stream from stream, so sync before accessing
 dev.sync()
 
 # prepare launch
 block = 256
 grid = (size + block - 1) // block
 config = LaunchConfig(grid=grid, block=block)
 
-# launch kernel on stream s
-launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+stream.sync()
 
 # check result
 assert cp.allclose(c, a + b)
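A closing note on a pattern repeated across these launches: scalar kernel arguments are wrapped in fixed-width dtypes (`cp.uint64(size)` here, `np.uint64(size)` in the strided-memory-view example) so the Python int is marshaled at the width the kernel's `size_t`-like parameter expects. A tiny illustration:

```python
import cupy as cp

size = 50000
arg = cp.uint64(size)   # pins the argument to an 8-byte unsigned integer
assert arg.nbytes == 8  # a bare Python int has no fixed width
```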