forked from NVIDIA/cuda-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoverview.html
More file actions
603 lines (563 loc) · 44 KB
/
overview.html
File metadata and controls
603 lines (563 loc) · 44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
<!doctype html>
<html class="no-js" lang="en">
<head><meta charset="utf-8"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<meta name="color-scheme" content="light dark"><link rel="index" title="Index" href="genindex.html" /><link rel="search" title="Search" href="search.html" /><link rel="next" title="Motivation" href="motivation.html" /><link rel="prev" title="Installation" href="install.html" />
<meta name="generator" content="sphinx-4.2.0, furo 2021.10.09"/>
<title>Overview - CUDA Python 11.6.0 documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="_static/styles/furo.css?digest=0254c309f5cadf746f1a613e7677379ac9c8cdcd" />
<link rel="stylesheet" type="text/css" href="_static/togglebutton.css" />
<link rel="stylesheet" type="text/css" href="_static/mystnb.css" />
<link rel="stylesheet" type="text/css" href="_static/styles/furo-extensions.css?digest=16fb25fabf47304eee183a5e9af80b1ba98259b1" />
<style>
/* Code-block colors for the furo theme. Light palette is the default. */
body {
--color-code-background: #f8f8f8;
--color-code-foreground: black;
}
/* Dark palette when the user explicitly selects the dark theme
   (set via document.body.dataset.theme by the toggle script). */
body[data-theme="dark"] {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
/* Auto mode: follow the OS dark preference unless light is forced. */
@media (prefers-color-scheme: dark) {
body:not([data-theme="light"]) {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
}
</style></head>
<body>
<script>
// Restore the theme the user persisted earlier; fall back to "auto"
// (follow the OS preference) when nothing has been stored yet.
var storedTheme = localStorage.getItem("theme");
document.body.dataset.theme = storedTheme || "auto";
</script>
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
<symbol id="svg-toc" viewBox="0 0 24 24">
<title>Contents</title>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round">
<path stroke="none" d="M0 0h24v24H0z" />
<line x1="4" y1="6" x2="20" y2="6" />
<line x1="10" y1="12" x2="20" y2="12" />
<line x1="6" y1="18" x2="20" y2="18" />
</svg>
</symbol>
<symbol id="svg-menu" viewBox="0 0 24 24">
<title>Menu</title>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
<line x1="3" y1="12" x2="21" y2="12"></line>
<line x1="3" y1="6" x2="21" y2="6"></line>
<line x1="3" y1="18" x2="21" y2="18"></line>
</svg>
</symbol>
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
<title>Expand</title>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
<polyline points="9 18 15 12 9 6"></polyline>
</svg>
</symbol>
<symbol id="svg-sun" viewBox="0 0 24 24">
<title>Light mode</title>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
<circle cx="12" cy="12" r="5"></circle>
<line x1="12" y1="1" x2="12" y2="3"></line>
<line x1="12" y1="21" x2="12" y2="23"></line>
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
<line x1="1" y1="12" x2="3" y2="12"></line>
<line x1="21" y1="12" x2="23" y2="12"></line>
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
</svg>
</symbol>
<symbol id="svg-moon" viewBox="0 0 24 24">
<title>Dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
</svg>
</symbol>
<symbol id="svg-sun-half" viewBox="0 0 24 24">
<title>Auto light/dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
<circle cx="12" cy="12" r="9" />
<path d="M13 12h5" />
<path d="M13 15h4" />
<path d="M13 18h1" />
<path d="M13 9h4" />
<path d="M13 6h1" />
</svg>
</symbol>
</svg>
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
<label class="overlay sidebar-overlay" for="__navigation">
<div class="visually-hidden">Hide navigation sidebar</div>
</label>
<label class="overlay toc-overlay" for="__toc">
<div class="visually-hidden">Hide table of contents sidebar</div>
</label>
<div class="page">
<header class="mobile-header">
<div class="header-left">
<label class="nav-overlay-icon" for="__navigation">
<div class="visually-hidden">Toggle site navigation sidebar</div>
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
</label>
</div>
<div class="header-center">
<a href="index.html"><div class="brand">CUDA Python 11.6.0 documentation</div></a>
</div>
<div class="header-right">
<div class="theme-toggle-container theme-toggle-header">
<button class="theme-toggle">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
</button>
</div>
<label class="toc-overlay-icon toc-header-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
</label>
</div>
</header>
<aside class="sidebar-drawer">
<div class="sidebar-container">
<div class="sidebar-sticky"><a class="sidebar-brand" href="index.html">
<div class="sidebar-logo-container">
<img class="sidebar-logo only-light" src="_static/logo-light-mode.png" alt="Light Logo"/>
<img class="sidebar-logo only-dark" src="_static/logo-dark-mode.png" alt="Dark Logo"/>
</div>
<span class="sidebar-brand-text">CUDA Python 11.6.0 documentation</span>
</a><form class="sidebar-search-container" method="get" action="search.html" role="search">
<input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
<input type="hidden" name="check_keywords" value="yes">
<input type="hidden" name="area" value="default">
</form>
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="install.html">Installation</a></li>
<li class="toctree-l1 current current-page"><a class="current reference internal" href="#">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="motivation.html">Motivation</a></li>
<li class="toctree-l1"><a class="reference internal" href="conduct.html">Code of Conduct</a></li>
<li class="toctree-l1"><a class="reference internal" href="contribute.html">Contributing</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="release.html">Release Notes</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="release/11.6.0-notes.html"> 11.6.0</a></li>
<li class="toctree-l2"><a class="reference internal" href="release/11.5.0-notes.html"> 11.5.0</a></li>
<li class="toctree-l2"><a class="reference internal" href="release/11.4.0-notes.html"> 11.4.0</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="api.html">CUDA Python API Reference</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="module/cuda.html">cuda</a></li>
<li class="toctree-l2"><a class="reference internal" href="module/cudart.html">cudart</a></li>
<li class="toctree-l2"><a class="reference internal" href="module/nvrtc.html">nvrtc</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</div>
</aside>
<div class="main">
<div class="content">
<div class="article-container">
<div class="content-icon-container">
<div class="theme-toggle-container theme-toggle-content">
<button class="theme-toggle">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
</button>
</div>
<label class="toc-overlay-icon toc-content-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
</label>
</div>
<article role="main">
<div class="tex2jax_ignore mathjax_ignore section" id="overview">
<h1>Overview<a class="headerlink" href="#overview" title="Permalink to this headline">¶</a></h1>
<p style="font-size: 14px; color: grey; text-align: right;">by <a href="https://developer.nvidia.com/blog/author/mnicely/">Matthew Nicely</a></p>
<p>Python plays a key role within the science, engineering, data analytics, and
deep learning application ecosystem. NVIDIA has long been committed to helping
the Python ecosystem leverage the accelerated massively parallel performance of
GPUs to deliver standardized libraries, tools, and applications. Today, we’re
introducing another step towards simplification of the developer experience with
improved Python code portability and compatibility.</p>
<p>Our goal is to help unify the Python CUDA ecosystem with a single standard set
of low-level interfaces, providing full coverage of and access to the CUDA host
APIs from Python. We want to provide an ecosystem foundation to allow
interoperability among different accelerated libraries. Most importantly, it
should be easy for Python developers to use NVIDIA GPUs.</p>
<div class="section" id="cuda-python-workflow">
<h2>CUDA Python workflow<a class="headerlink" href="#cuda-python-workflow" title="Permalink to this headline">¶</a></h2>
<p>Because Python is an interpreted language, you need a way to compile the device
code into
<a class="reference external" href="https://docs.nvidia.com/cuda/parallel-thread-execution/index.html">PTX</a> and
then extract the function to be called at a later point in the application. It’s
not important for understanding CUDA Python, but Parallel Thread Execution (PTX)
is a low-level virtual machine and instruction set architecture (ISA). You
construct your device code in the form of a string and compile it with
<a class="reference external" href="http://docs.nvidia.com/cuda/nvrtc/index.html">NVRTC</a>, a runtime compilation
library for CUDA C++. Using the NVIDIA <a class="reference external" href="http://docs.nvidia.com/cuda/cuda-driver-api/index.html">Driver
API</a>, manually create a
CUDA context and all required resources on the GPU, then launch the compiled
CUDA C++ code and retrieve the results from the GPU. Now that you have an
overview, jump into a commonly used example for parallel programming:
<a class="reference external" href="https://developer.nvidia.com/blog/six-ways-saxpy/">SAXPY</a>.</p>
<p>The first thing to do is import the <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-driver-api/index.html">Driver
API</a> and
<a class="reference external" href="https://docs.nvidia.com/cuda/nvrtc/index.html">NVRTC</a> modules from the CUDA
Python package. In this example, you copy data from the host to device. You need
<a class="reference external" href="https://numpy.org/doc/stable/contents.html">NumPy</a> to store data on the host.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">cuda</span> <span class="kn">import</span> <span class="n">cuda</span><span class="p">,</span> <span class="n">nvrtc</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
</pre></div>
</div>
</div>
</div>
<p>Error checking is a fundamental best practice in code development and a code
example is provided. For brevity, error checking within the example is omitted.
In a future release, this may automatically raise exceptions using a Python
object model.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">ASSERT_DRV</span><span class="p">(</span><span class="n">err</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">err</span><span class="p">,</span> <span class="n">cuda</span><span class="o">.</span><span class="n">CUresult</span><span class="p">):</span>
<span class="k">if</span> <span class="n">err</span> <span class="o">!=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">CUresult</span><span class="o">.</span><span class="n">CUDA_SUCCESS</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">"Cuda Error: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">err</span><span class="p">))</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">err</span><span class="p">,</span> <span class="n">nvrtc</span><span class="o">.</span><span class="n">nvrtcResult</span><span class="p">):</span>
<span class="k">if</span> <span class="n">err</span> <span class="o">!=</span> <span class="n">nvrtc</span><span class="o">.</span><span class="n">nvrtcResult</span><span class="o">.</span><span class="n">NVRTC_SUCCESS</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">"Nvrtc Error: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">err</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">"Unknown error type: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">err</span><span class="p">))</span>
</pre></div>
</div>
</div>
</div>
<p>It’s common practice to write CUDA kernels near the top of a translation unit,
so write it next. The entire kernel is wrapped in triple quotes to form a
string. The string is compiled later using NVRTC. This is the only part of CUDA
Python that requires some understanding of CUDA C++. For more information, see
<a class="reference external" href="https://developer.nvidia.com/blog/even-easier-introduction-cuda/">An Even Easier Introduction to
CUDA</a>.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">saxpy</span> <span class="o">=</span> <span class="s2">"""</span><span class="se">\</span>
<span class="s2">extern "C" __global__</span>
<span class="s2">void saxpy(float a, float *x, float *y, float *out, size_t n)</span>
<span class="s2">{</span>
<span class="s2"> size_t tid = blockIdx.x * blockDim.x + threadIdx.x;</span>
<span class="s2"> if (tid < n) {</span>
<span class="s2"> out[tid] = a * x[tid] + y[tid];</span>
<span class="s2"> }</span>
<span class="s2">}</span>
<span class="s2">"""</span>
</pre></div>
</div>
</div>
</div>
<p>Go ahead and compile the kernel into PTX. Remember that this is executed at runtime using NVRTC. There are three basic steps to NVRTC:</p>
<ul class="simple">
<li><p>Create a program from the string.</p></li>
<li><p>Compile the program.</p></li>
<li><p>Extract PTX from the compiled program.</p></li>
</ul>
<p>In the following code example, compilation is targeting compute capability 75,
or Turing architecture, with FMAD enabled. If compilation fails, use
<code class="docutils literal notranslate"><span class="pre">nvrtcGetProgramLog</span></code> to retrieve a compile log for additional information.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create program</span>
<span class="n">err</span><span class="p">,</span> <span class="n">prog</span> <span class="o">=</span> <span class="n">nvrtc</span><span class="o">.</span><span class="n">nvrtcCreateProgram</span><span class="p">(</span><span class="nb">str</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">saxpy</span><span class="p">),</span> <span class="sa">b</span><span class="s2">"saxpy.cu"</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="p">[],</span> <span class="p">[])</span>
<span class="c1"># Compile program</span>
<span class="n">opts</span> <span class="o">=</span> <span class="p">[</span><span class="sa">b</span><span class="s2">"--fmad=false"</span><span class="p">,</span> <span class="sa">b</span><span class="s2">"--gpu-architecture=compute_75"</span><span class="p">]</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">nvrtc</span><span class="o">.</span><span class="n">nvrtcCompileProgram</span><span class="p">(</span><span class="n">prog</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">opts</span><span class="p">)</span>
<span class="c1"># Get PTX from compilation</span>
<span class="n">err</span><span class="p">,</span> <span class="n">ptxSize</span> <span class="o">=</span> <span class="n">nvrtc</span><span class="o">.</span><span class="n">nvrtcGetPTXSize</span><span class="p">(</span><span class="n">prog</span><span class="p">)</span>
<span class="n">ptx</span> <span class="o">=</span> <span class="sa">b</span><span class="s2">" "</span> <span class="o">*</span> <span class="n">ptxSize</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">nvrtc</span><span class="o">.</span><span class="n">nvrtcGetPTX</span><span class="p">(</span><span class="n">prog</span><span class="p">,</span> <span class="n">ptx</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Before you can use the PTX or do any work on the GPU, you must create a CUDA
context. CUDA contexts are analogous to host processes for the device. In the
following code example, the Driver API is initialized so that the NVIDIA driver
and GPU are accessible. Next, a handle for compute device 0 is passed to
<code class="docutils literal notranslate"><span class="pre">cuCtxCreate</span></code> to designate that GPU for context creation. With the context
created, you can proceed in compiling the CUDA kernel using NVRTC.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Initialize CUDA Driver API</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuInit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="c1"># Retrieve handle for device 0</span>
<span class="n">err</span><span class="p">,</span> <span class="n">cuDevice</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuDeviceGet</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="c1"># Create context</span>
<span class="n">err</span><span class="p">,</span> <span class="n">context</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuCtxCreate</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">cuDevice</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>With a CUDA context created on device 0, load the PTX generated earlier into a
module. A module is analogous to dynamically loaded libraries for the device.
After loading into the module, extract a specific kernel with
<code class="docutils literal notranslate"><span class="pre">cuModuleGetFunction</span></code>. It is not uncommon for multiple kernels to reside in PTX.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Load PTX as module data and retrieve function</span>
<span class="n">ptx</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">char</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">ptx</span><span class="p">)</span>
<span class="c1"># Note: Incompatible --gpu-architecture would be detected here</span>
<span class="n">err</span><span class="p">,</span> <span class="n">module</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuModuleLoadData</span><span class="p">(</span><span class="n">ptx</span><span class="o">.</span><span class="n">ctypes</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>
<span class="n">ASSERT_DRV</span><span class="p">(</span><span class="n">err</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="n">kernel</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuModuleGetFunction</span><span class="p">(</span><span class="n">module</span><span class="p">,</span> <span class="sa">b</span><span class="s2">"saxpy"</span><span class="p">)</span>
<span class="n">ASSERT_DRV</span><span class="p">(</span><span class="n">err</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Next, get all your data prepared and transferred to the GPU. For increased
application performance, you can input data on the device to eliminate data
transfers. For completeness, this example shows how you would transfer data to
and from the device.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">NUM_THREADS</span> <span class="o">=</span> <span class="mi">512</span> <span class="c1"># Threads per block</span>
<span class="n">NUM_BLOCKS</span> <span class="o">=</span> <span class="mi">32768</span> <span class="c1"># Blocks per grid</span>
<span class="n">a</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mf">2.0</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">n</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">NUM_THREADS</span> <span class="o">*</span> <span class="n">NUM_BLOCKS</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">uint32</span><span class="p">)</span>
<span class="n">bufferSize</span> <span class="o">=</span> <span class="n">n</span> <span class="o">*</span> <span class="n">a</span><span class="o">.</span><span class="n">itemsize</span>
<span class="n">hX</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">hY</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">hOut</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>With the input data <code class="docutils literal notranslate"><span class="pre">a</span></code>, <code class="docutils literal notranslate"><span class="pre">x</span></code>, and <code class="docutils literal notranslate"><span class="pre">y</span></code> created for the SAXPY transform device,
resources must be allocated to store the data using <code class="docutils literal notranslate"><span class="pre">cuMemAlloc</span></code>. To allow for
more overlap between compute and data movement, use the asynchronous function
<code class="docutils literal notranslate"><span class="pre">cuMemcpyHtoDAsync</span></code>. It returns control to the CPU immediately following command
execution.</p>
<p>Python doesn’t have a natural concept of pointers, yet <code class="docutils literal notranslate"><span class="pre">cuMemcpyHtoDAsync</span></code> expects
<code class="docutils literal notranslate"><span class="pre">void*</span></code>. Therefore, <code class="docutils literal notranslate"><span class="pre">XX.ctypes.data</span></code> retrieves the pointer value associated with
XX.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">err</span><span class="p">,</span> <span class="n">dXclass</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemAlloc</span><span class="p">(</span><span class="n">bufferSize</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="n">dYclass</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemAlloc</span><span class="p">(</span><span class="n">bufferSize</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="n">dOutclass</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemAlloc</span><span class="p">(</span><span class="n">bufferSize</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="n">stream</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuStreamCreate</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemcpyHtoDAsync</span><span class="p">(</span>
<span class="n">dXclass</span><span class="p">,</span> <span class="n">hX</span><span class="o">.</span><span class="n">ctypes</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">bufferSize</span><span class="p">,</span> <span class="n">stream</span>
<span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemcpyHtoDAsync</span><span class="p">(</span>
<span class="n">dYclass</span><span class="p">,</span> <span class="n">hY</span><span class="o">.</span><span class="n">ctypes</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">bufferSize</span><span class="p">,</span> <span class="n">stream</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>With data prep and resources allocation finished, the kernel is ready to be
launched. To pass the location of the data on the device to the kernel execution
configuration, you must retrieve the device pointer. In the following code
example, <code class="docutils literal notranslate"><span class="pre">int(dXclass)</span></code> retrieves the pointer value of <code class="docutils literal notranslate"><span class="pre">dXclass</span></code>, which is
<code class="docutils literal notranslate"><span class="pre">CUdeviceptr</span></code>, and assigns a memory size to store this value using <code class="docutils literal notranslate"><span class="pre">np.array</span></code>.</p>
<p>Like <code class="docutils literal notranslate"><span class="pre">cuMemcpyHtoDAsync</span></code>, <code class="docutils literal notranslate"><span class="pre">cuLaunchKernel</span></code> expects <code class="docutils literal notranslate"><span class="pre">void**</span></code> in the argument list. In
the following code example, it creates <code class="docutils literal notranslate"><span class="pre">void**</span></code> by grabbing the <code class="docutils literal notranslate"><span class="pre">void*</span></code> value of each
individual argument and placing them into its own contiguous memory.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># The following code example is not intuitive </span>
<span class="c1"># Subject to change in a future release</span>
<span class="n">dX</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="nb">int</span><span class="p">(</span><span class="n">dXclass</span><span class="p">)],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">uint64</span><span class="p">)</span>
<span class="n">dY</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="nb">int</span><span class="p">(</span><span class="n">dYclass</span><span class="p">)],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">uint64</span><span class="p">)</span>
<span class="n">dOut</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="nb">int</span><span class="p">(</span><span class="n">dOutclass</span><span class="p">)],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">uint64</span><span class="p">)</span>
<span class="n">args</span> <span class="o">=</span> <span class="p">[</span><span class="n">a</span><span class="p">,</span> <span class="n">dX</span><span class="p">,</span> <span class="n">dY</span><span class="p">,</span> <span class="n">dOut</span><span class="p">,</span> <span class="n">n</span><span class="p">]</span>
<span class="n">args</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="n">arg</span><span class="o">.</span><span class="n">ctypes</span><span class="o">.</span><span class="n">data</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">uint64</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Now the kernel can be launched:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuLaunchKernel</span><span class="p">(</span>
<span class="n">kernel</span><span class="p">,</span>
<span class="n">NUM_BLOCKS</span><span class="p">,</span> <span class="c1"># grid x dim</span>
<span class="mi">1</span><span class="p">,</span> <span class="c1"># grid y dim</span>
<span class="mi">1</span><span class="p">,</span> <span class="c1"># grid z dim</span>
<span class="n">NUM_THREADS</span><span class="p">,</span> <span class="c1"># block x dim</span>
<span class="mi">1</span><span class="p">,</span> <span class="c1"># block y dim</span>
<span class="mi">1</span><span class="p">,</span> <span class="c1"># block z dim</span>
<span class="mi">0</span><span class="p">,</span> <span class="c1"># dynamic shared memory</span>
<span class="n">stream</span><span class="p">,</span> <span class="c1"># stream</span>
<span class="n">args</span><span class="o">.</span><span class="n">ctypes</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="c1"># kernel arguments</span>
<span class="mi">0</span><span class="p">,</span> <span class="c1"># extra (ignore)</span>
<span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemcpyDtoHAsync</span><span class="p">(</span>
<span class="n">hOut</span><span class="o">.</span><span class="n">ctypes</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">dOutclass</span><span class="p">,</span> <span class="n">bufferSize</span><span class="p">,</span> <span class="n">stream</span>
<span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuStreamSynchronize</span><span class="p">(</span><span class="n">stream</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>The <code class="docutils literal notranslate"><span class="pre">cuLaunchKernel</span></code> function takes the compiled module kernel and execution
configuration parameters. The device code is launched in the same stream as the
data transfers. That ensures that the kernel’s compute is performed only after
the data transfer has finished, as all API calls and kernel launches within a
stream are serialized. After the call to transfer data back to the host is
executed, <code class="docutils literal notranslate"><span class="pre">cuStreamSynchronize</span></code> is used to halt CPU execution until all operations
in the designated stream are finished.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Assert values are same after running kernel</span>
<span class="n">hZ</span> <span class="o">=</span> <span class="n">a</span> <span class="o">*</span> <span class="n">hX</span> <span class="o">+</span> <span class="n">hY</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">hOut</span><span class="p">,</span> <span class="n">hZ</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Error outside tolerance for host-device vectors"</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>Perform verification of the data to ensure correctness and finish the code with
memory clean up.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuStreamDestroy</span><span class="p">(</span><span class="n">stream</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemFree</span><span class="p">(</span><span class="n">dXclass</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemFree</span><span class="p">(</span><span class="n">dYclass</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuMemFree</span><span class="p">(</span><span class="n">dOutclass</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuModuleUnload</span><span class="p">(</span><span class="n">module</span><span class="p">)</span>
<span class="n">err</span><span class="p">,</span> <span class="o">=</span> <span class="n">cuda</span><span class="o">.</span><span class="n">cuCtxDestroy</span><span class="p">(</span><span class="n">context</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
</div>
<div class="section" id="performance">
<h2>Performance<a class="headerlink" href="#performance" title="Permalink to this headline">¶</a></h2>
<p>Performance is a primary driver in targeting GPUs in your application. So, how
does the above code compare to its C++ version? Table 1 shows that the results
are nearly identical. <a class="reference external" href="https://developer.nvidia.com/nsight-systems">NVIDIA Nsight
Systems</a> was used to retrieve
kernel performance and <a class="reference external" href="https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/">CUDA
Events</a>
was used for application performance.</p>
<p>The following command was used to profile the applications:</p>
<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>nsys profile -s none -t cuda --stats<span class="o">=</span><span class="nb">true</span> <executable>
</pre></div>
</div>
<div class="table-wrapper"><table class="docutils align-default" id="id1">
<caption><span class="caption-number">Table 1 </span><span class="caption-text">Kernel and application performance comparison.</span><a class="headerlink" href="#id1" title="Permalink to this table">¶</a></caption>
<colgroup>
<col style="width: 33%"/>
<col style="width: 33%"/>
<col style="width: 33%"/>
</colgroup>
<thead>
<tr class="row-odd"><th class="head"></th>
<th class="head"><p>C++</p></th>
<th class="head"><p>Python</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Kernel execution</p></td>
<td><p>352µs</p></td>
<td><p>352µs</p></td>
</tr>
<tr class="row-odd"><td><p>Application execution</p></td>
<td><p>1076ms</p></td>
<td><p>1080ms</p></td>
</tr>
</tbody>
</table></div>
<p>CUDA Python is also compatible with <a class="reference external" href="https://developer.nvidia.com/nsight-compute">NVIDIA Nsight
Compute</a>, which is an
interactive kernel profiler for CUDA applications. It allows you to have
detailed insights into kernel performance. This is useful when you’re trying to
maximize performance (<a class="reference internal" href="#figure-1"><span class="std std-numref">Fig. 1</span></a>).</p>
<div class="figure align-default" id="figure-1">
<img alt="Screenshot of Nsight Compute CLI output of the CUDA Python example" src="_images/Nsigth-Compute-CLI-625x473.png"/>
<p class="caption"><span class="caption-number">Fig. 1 </span><span class="caption-text">Screenshot of Nsight Compute CLI output of CUDA Python example.</span><a class="headerlink" href="#figure-1" title="Permalink to this image">¶</a></p>
</div>
</div>
</div>
</article>
</div>
<footer>
<div class="related-pages">
<a class="next-page" href="motivation.html">
<div class="page-info">
<div class="context">
<span>Next</span>
</div>
<div class="title">Motivation</div>
</div>
<svg><use href="#svg-arrow-right"></use></svg>
</a>
<a class="prev-page" href="install.html">
<svg><use href="#svg-arrow-right"></use></svg>
<div class="page-info">
<div class="context">
<span>Previous</span>
</div>
<div class="title">Installation</div>
</div>
</a>
</div>
<div class="related-information">
Copyright © 2021-2022, NVIDIA |
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a>
and
<a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
<a href="https://github.com/pradyunsg/furo">Furo theme</a>. |
<a class="muted-link" href="_sources/overview.md.txt"
rel="nofollow">
Show Source
</a>
</div>
</footer>
</div>
<aside class="toc-drawer">
<div class="toc-sticky toc-scroll">
<div class="toc-title-container">
<span class="toc-title">
Contents
</span>
</div>
<div class="toc-tree-container">
<div class="toc-tree">
<ul>
<li><a class="reference internal" href="#">Overview</a><ul>
<li><a class="reference internal" href="#cuda-python-workflow">CUDA Python workflow</a></li>
<li><a class="reference internal" href="#performance">Performance</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</aside>
</div>
</div><script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
<script src="_static/underscore.js"></script>
<script src="_static/doctools.js"></script>
<script src="_static/scripts/main.js"></script>
<script src="_static/togglebutton.js"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown, .tag_hide_input div.cell_input, .tag_hide-input div.cell_input, .tag_hide_output div.cell_output, .tag_hide-output div.cell_output, .tag_hide_cell.cell, .tag_hide-cell.cell';</script>
</body>
</html>