6 changes: 3 additions & 3 deletions cuda_core/examples/cuda_graphs.py
@@ -81,9 +81,9 @@ def main():
     result3 = cp.empty_like(a)
 
     # Prepare launch configuration
-    block_size = 256
-    grid_size = (size + block_size - 1) // block_size
-    config = LaunchConfig(grid=grid_size, block=block_size)
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
 
     # Sync before graph capture
     dev.sync()
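A side note on the computation being renamed here: `grid` is the usual ceiling division, so the launch covers every element even when `size` is not a multiple of `block`. A minimal standalone sketch (plain Python, helper name hypothetical):

```python
def blocks_needed(size: int, block: int) -> int:
    # Ceiling division: adds one partial block when size % block != 0.
    return (size + block - 1) // block

assert blocks_needed(1024, 256) == 4  # exact multiple
assert blocks_needed(1025, 256) == 5  # one extra block for the tail
```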
17 changes: 14 additions & 3 deletions cuda_core/examples/gl_interop_plasma.py
@@ -94,8 +94,8 @@ def setup_cuda(kernel_source):
     dev.set_current()
     stream = dev.create_stream()
 
-    opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(kernel_source, code_type="c++", options=opts)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(kernel_source, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("plasma")
 
@@ -114,14 +114,25 @@ def create_window():
"""Open a pyglet window and return (window, gl_module)."""
try:
import pyglet
from pyglet.gl import gl as _gl
except ImportError:
print(
"This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet",
file=sys.stderr,
)
sys.exit(1)

try:
from pyglet.gl import gl as _gl
except ImportError as exc:
print(
"pyglet is installed, but OpenGL could not be initialized.\n"
f"Underlying import error: {exc}\n"
"Ensure your system provides OpenGL runtime libraries (for example, libGL on Linux), "
"and that graphics/display support is available.",
file=sys.stderr,
)
sys.exit(1)

window = pyglet.window.Window(
WIDTH,
HEIGHT,
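The split above distinguishes "pyglet is not installed" from "pyglet is installed but its OpenGL backend cannot load". For readers adapting the pattern elsewhere, a minimal standalone sketch of the same idea (messages illustrative):

```python
import importlib.util
import sys

# Stage 1: is the package installed at all?
if importlib.util.find_spec("pyglet") is None:
    sys.exit("pyglet is not installed; run: pip install pyglet")

# Stage 2: the package exists, but its OpenGL backend may still fail
# to load (e.g. libGL missing on a headless Linux box).
try:
    from pyglet.gl import gl  # noqa: F401  (import check only)
except ImportError as exc:
    sys.exit(f"pyglet is installed but OpenGL failed to load: {exc}")
```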
14 changes: 7 additions & 7 deletions cuda_core/examples/pytorch_example.py
@@ -48,7 +48,7 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python
 
 
-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -60,7 +60,7 @@ def __cuda_stream__(self):
 )
 
 # Run in single precision
-ker = mod.get_kernel("saxpy_kernel<float>")
+kernel = mod.get_kernel("saxpy_kernel<float>")
 dtype = torch.float32
 
 # prepare input/output
@@ -75,16 +75,16 @@ def __cuda_stream__(self):
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on our stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a.item() * x + y)
 
 # let's repeat again with double precision
-ker = mod.get_kernel("saxpy_kernel<double>")
+kernel = mod.get_kernel("saxpy_kernel<double>")
 dtype = torch.float64
 
 # prepare input
@@ -101,10 +101,10 @@ def __cuda_stream__(self):
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on PyTorch's stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a * x + y)
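For context on the renamed `stream`: it adopts PyTorch's stream through the `__cuda_stream__` protocol visible in the hunk headers. The wrapper class body sits mostly outside the visible hunks, so the following is a reconstruction from the fragments above, not the file's verbatim code:

```python
import torch

class PyTorchStreamWrapper:
    """Expose a torch.cuda.Stream via the __cuda_stream__ protocol."""

    def __init__(self, pt_stream: torch.cuda.Stream):
        self.pt_stream = pt_stream

    def __cuda_stream__(self):
        stream_id = self.pt_stream.cuda_stream  # raw CUDA stream handle
        return (0, stream_id)  # (version, handle) format required by CUDA Python
```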
30 changes: 15 additions & 15 deletions cuda_core/examples/saxpy.py
@@ -35,7 +35,7 @@

 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -51,7 +51,7 @@
 )
 
 # run in single precision
-ker = mod.get_kernel("saxpy<float>")
+kernel = mod.get_kernel("saxpy<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -61,24 +61,24 @@
 x = rng.random(size, dtype=dtype)
 y = rng.random(size, dtype=dtype)
 out = cp.empty_like(x)
-dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+dev.sync()  # cupy runs on a different stream from stream, so sync before accessing
 
 # prepare launch
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 assert cp.allclose(out, a * x + y)
 
 # let's repeat again, this time allocating our own out buffer instead of cupy's
 # run in double precision
-ker = mod.get_kernel("saxpy<double>")
+kernel = mod.get_kernel("saxpy<double>")
 dtype = cp.float64
 
 # prepare input
@@ -91,18 +91,18 @@
 # prepare output
 buf = dev.allocate(
     size * 8,  # = dtype.itemsize
-    stream=s,
+    stream=stream,
 )
 
 # prepare launch
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 # we wrap output buffer as a cupy array for simplicity
@@ -113,5 +113,5 @@

 # clean up resources that we allocate
 # cupy automatically cleans up the rest
-buf.close(s)
-s.close()
+buf.close(stream)
+stream.close()
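Aside: unlike the CuPy-owned arrays, `buf` comes from `dev.allocate(..., stream=stream)`, so it is stream-ordered and must be released in stream order too, which is why `buf.close(stream)` takes the stream. A minimal sketch of that lifecycle using only calls that appear above (size illustrative):

```python
from cuda.core import Device

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Stream-ordered allocation: usable once prior work on `stream` has run.
buf = dev.allocate(1024, stream=stream)

# ... launch kernels that read/write `buf` on `stream` ...

buf.close(stream)  # free in stream order, after the enqueued kernels
stream.close()
```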
14 changes: 7 additions & 7 deletions cuda_core/examples/simple_multi_gpu_example.py
@@ -13,7 +13,7 @@

 import cupy as cp
 
-from cuda.core import Device, LaunchConfig, Program, launch, system
+from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
 
 if system.get_num_devices() < 2:
     print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -40,9 +40,9 @@
 }
 }
 """
-prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
 mod_add = prog_add.compile("cubin")
-ker_add = mod_add.get_kernel("vector_add")
+add_kernel = mod_add.get_kernel("vector_add")
 
 # Set GPU 1
 dev1 = Device(1)
@@ -62,9 +62,9 @@
 }
 }
 """
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
 mod_sub = prog_sub.compile("cubin")
-ker_sub = mod_sub.get_kernel("vector_sub")
+sub_kernel = mod_sub.get_kernel("vector_sub")
 
 
 # This adaptor ensures that any foreign stream (ex: from CuPy) that has not
@@ -100,7 +100,7 @@ def __cuda_stream__(self):
 stream0.wait(cp_stream0)
 
 # Launch the add kernel on GPU 0 / stream 0
-launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
 
 # Allocate memory on GPU 1
 # Note: This runs on CuPy's current stream for GPU 1.
@@ -115,7 +115,7 @@ def __cuda_stream__(self):
 stream1.wait(cp_stream1)
 
 # Launch the subtract kernel on GPU 1 / stream 1
-launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
 
 # Synchronize both GPUs and validate the results
 dev0.set_current()
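The substantive fix in this file (beyond the renames) is that `options` was a plain dict; the PR passes a `ProgramOptions` instance instead, matching the other examples. A minimal sketch of the corrected pattern, using only calls that appear in this PR (the kernel source is a placeholder):

```python
from cuda.core import Device, Program, ProgramOptions

dev = Device(0)
dev.set_current()

code = r"""
extern "C" __global__ void noop() {}
"""

# ProgramOptions is a typed options object; the keyword names mirror
# the old dict keys ("std", "arch") used before this change.
options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
prog = Program(code, code_type="c++", options=options)
mod = prog.compile("cubin")
kernel = mod.get_kernel("noop")
```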
16 changes: 8 additions & 8 deletions cuda_core/examples/strided_memory_view_gpu.py
@@ -57,7 +57,7 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream, gpu_ker):
+def my_func(arr, work_stream, kernel):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
@@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker):
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
-    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    launch(work_stream, config, kernel, view.ptr, np.uint64(size))
     # Here we're being conservative and synchronize over our work stream,
     # assuming we do not know the data stream; if we know then we could
     # just order the data stream after the work stream here, e.g.
@@ -101,24 +101,24 @@ def run():
     # To know the GPU's compute capability, we need to identify which GPU to use.
     dev = Device(0)
     dev.set_current()
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
+    prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel(func_name)
 
-    s = dev.create_stream()
+    stream = dev.create_stream()
     try:
         # Create input array on GPU
         arr_gpu = cp.ones(1024, dtype=cp.int32)
         print(f"before: {arr_gpu[:10]=}")
 
         # Run the workload
-        my_func(arr_gpu, s, gpu_ker)
+        my_func(arr_gpu, stream, kernel)
 
         # Check the result
         print(f"after: {arr_gpu[:10]=}")
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
-        s.close()
+        stream.close()
 
 
 if __name__ == "__main__":
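The truncated comment about ordering the data stream after the work stream refers to the same `wait` mechanism used in the multi-GPU example (`stream0.wait(cp_stream0)`). A minimal sketch of that non-blocking alternative to `work_stream.sync()` (stream names illustrative):

```python
from cuda.core import Device

dev = Device()
dev.set_current()
work_stream = dev.create_stream()
data_stream = dev.create_stream()

# ... launch kernels on work_stream that produce some array ...

# Order data_stream after work_stream: later work on data_stream sees
# the results, without blocking the host the way a sync() would.
data_stream.wait(work_stream)
```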
4 changes: 2 additions & 2 deletions cuda_core/examples/thread_block_cluster.py
@@ -94,7 +94,7 @@
     options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
 )
 mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
+kernel = mod.get_kernel("check_cluster_info")
 
 # prepare launch config
 grid = 4
@@ -122,7 +122,7 @@
 block_dims[:] = 0
 
 # launch kernel on the default stream
-launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
 dev.sync()
 
 # verify results
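For readers new to this example: the `config` used in the launch groups thread blocks into clusters. The `LaunchConfig` line itself falls outside the visible hunks, so this sketch reconstructs it from the surrounding code and should be treated as an assumption about the file, not a quote (cluster launches also require a recent GPU architecture, e.g. sm_90+):

```python
from cuda.core import LaunchConfig

# grid counts thread blocks; cluster groups them within the grid.
config = LaunchConfig(grid=4, cluster=2, block=32)
```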
12 changes: 6 additions & 6 deletions cuda_core/examples/vector_add.py
@@ -30,15 +30,15 @@

 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
 prog = Program(code, code_type="c++", options=program_options)
 mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
 # run in single precision
-ker = mod.get_kernel("vector_add<float>")
+kernel = mod.get_kernel("vector_add<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -48,17 +48,17 @@
 b = rng.random(size, dtype=dtype)
 c = cp.empty_like(a)
 
-# cupy runs on a different stream from s, so sync before accessing
+# cupy runs on a different stream from stream, so sync before accessing
 dev.sync()
 
 # prepare launch
 block = 256
 grid = (size + block - 1) // block
 config = LaunchConfig(grid=grid, block=block)
 
-# launch kernel on stream s
-launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+stream.sync()
 
 # check result
 assert cp.allclose(c, a + b)
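A closing note on a pattern repeated across these launches: scalar kernel arguments are wrapped in fixed-width dtypes (`cp.uint64(size)` here, `np.uint64(size)` in the strided-memory-view example) so the Python int is marshaled at the width the kernel's `size_t`-like parameter expects. A tiny illustration:

```python
import cupy as cp

size = 50000
arg = cp.uint64(size)   # pins the argument to an 8-byte unsigned integer
assert arg.nbytes == 8  # a bare Python int has no fixed width
```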