Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 1 addition & 170 deletions docs/sources/conf.py
Original file line number Diff line number Diff line change
@@ -1,172 +1,3 @@
""""
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

import sphinx_rtd_theme

# -- Project information -----------------------------------------------------

project = "Data Parallel Extensions for Python"
copyright = "2021, Intel"
author = "Intel"

# The full version, including alpha/beta/rc tags
release = "0.0.1"


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"recommonmark",
"sphinx_rtd_theme",
"sphinx.ext.autodoc",
"sphinx.ext.todo",
]

todo_include_todos = True

source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ["_static"]
"""

# *****************************************************************************
# Copyright (c) 2020, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************


# coding: utf-8
# Configuration file for the Sphinx documentation builder.
#
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Import sdc package to build API Reference -------------------------------
#import os
#import sys
#import shutil

#SDC_DOC_NO_API_REF_STR = 'SDC_DOC_NO_API_REF'
#SDC_DOC_APIREF_DIR = '_api_ref'

#sys.path.insert(0, os.path.relpath('buildscripts'))
#sdc_doc_no_api_ref = False # Generate API Reference by default

#if SDC_DOC_NO_API_REF_STR in os.environ:
# sdc_doc_no_api_ref = os.environ[SDC_DOC_NO_API_REF_STR] == '1'

#if not sdc_doc_no_api_ref:
# if os.path.exists(SDC_DOC_APIREF_DIR):
# shutil.rmtree(SDC_DOC_APIREF_DIR)

# try:
# import sdc
# except ImportError:
# raise ImportError('Cannot import sdc.\n'
# 'Documentation generator for API Reference for a given module expects that module '
# 'to be installed. Use conda/pip install SDC to install it prior to using API Reference '
# 'generation. If you want to disable API Reference generation, set the environment '
# 'variable SDC_DOC_NO_API_REF=1')

# try:
# from apiref_generator import generate_api_reference
# except ImportError:
# raise ImportError('Cannot import apiref_generator', os.getcwd())

# generate_api_reference()

#SDC_DOC_NO_EXAMPLES_STR = 'SDC_DOC_NO_EXAMPLES'
#SDC_DOC_EXAMPLES_DIR = '_examples'

#sdc_doc_no_examples = False # Generate examples list by default
#if SDC_DOC_NO_EXAMPLES_STR in os.environ:
# sdc_doc_no_examples = os.environ[SDC_DOC_NO_EXAMPLES_STR] == '1'

#if not sdc_doc_no_examples:
# if os.path.exists(SDC_DOC_EXAMPLES_DIR):
# shutil.rmtree(SDC_DOC_EXAMPLES_DIR)

# try:
# import sdc
# except ImportError:
# raise ImportError('Cannot import sdc.\n'
# 'Documentation generator for Examples for a given module expects that module '
# 'to be installed. Use conda/pip install SDC to install it prior to using API Examples '
# 'generation. If you want to disable Examples generation, set the environment '
# 'variable SDC_DOC_NO_EXAMPLES_STR=1')

# try:
# from examples_generator import generate_examples
# except ImportError:
# raise ImportError('Cannot import examples_generator', os.getcwd())

# generate_examples()

# -- Project information -----------------------------------------------------

project = 'Data Parallel Extensions for Python*'
Expand All @@ -176,7 +7,6 @@
# The full version, including alpha/beta/rc tags
release = '0.1'


# -- General configuration ----------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
Expand All @@ -188,6 +18,7 @@
'sphinx.ext.extlinks',
'sphinx.ext.githubpages',
'sphinx.ext.napoleon',
'sphinx.ext.autosectionlabel',
'sphinxcontrib.programoutput',
]

Expand Down
9 changes: 7 additions & 2 deletions docs/sources/ext_links.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,10 @@
**********************************************************
.. _NumPy*: https://numpy.org/
.. _Numba*: https://numba.pydata.org/
.. _Python Array API Standard: https://data-apis.org/array-api/
.. _Intel Distribution for Python: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
.. _Python* Array API Standard: https://data-apis.org/array-api/
.. _Intel Distribution for Python*: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
.. _OpenCl*: https://www.khronos.org/opencl/
.. _DPC++: https://www.apress.com/gp/book/9781484255735
.. _Data Parallel Extension for Numba*: https://intelpython.github.io/numba-dpex/latest/index.html
.. _SYCL*: https://www.khronos.org/sycl/
.. _Data Parallel Control: https://intelpython.github.io/dpctl/latest/index.html
144 changes: 144 additions & 0 deletions docs/sources/heterogeneous_computing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,147 @@

Heterogeneous computing
=======================

Device Offload
**************

Python is an interpreted language, which implies that most Python code will run on the CPU,
and only a few data parallel regions will execute on data parallel devices.
That is why the concept of host and offload devices is useful when it comes to conceptualizing
a heterogeneous programming model in Python.

.. image:: ./_images/hetero-devices.png
:width: 600px
:align: center
:alt: SIMD

The above diagram illustrates the *host* (the CPU which runs Python interpreter) and three *devices*
(two GPU devices and one attached accelerator device). **Data Parallel Extensions for Python**
offer a programming model where a script executed by the Python interpreter on the host can *offload* data
parallel kernels to user-specified device. A *kernel* is the *data parallel region* of a program submitted
for execution on the device. There can be multiple data parallel regions, and hence multiple *offload kernels*.

Kernels can be pre-compiled into a library, such as ``dpnp``, or, alternatively, directly coded
in a programming language for heterogeneous computing, such as `OpenCl*`_ or `DPC++`_ .
**Data Parallel Extensions for Python** offer a way of writing kernels directly in Python
using `Numba*`_ compiler along with ``numba-dpex``, the `Data Parallel Extension for Numba*`_.

One or more kernels are submitted for execution into a *queue* targeting an *offload device*.
For each device one or more queues can be created. In most cases you won’t need to work
with device queues directly. Data Parallel Extensions for Python will do the necessary underlying
work with queues for you through the :ref:`Compute-Follows-Data`.

Unified Shared Memory
*********************

Each device has its own memory, not necessarily accessible from another device.

.. image:: ./_images/hetero-devices.png
:width: 600px
:align: center
:alt: SIMD

For example, **Device 1** memory may not be directly accessible from the host, but only accessible
via expensive copying by a driver software. Similarly, depending on the architecture, direct data
exchange between **Device 2** and **Device 1** may be impossible, and only possible via expensive
copying through the host memory. These aspects must be taken into consideration when programming
data parallel devices.

In the above illustration the **Device 2** logically consists of two sub-devices, **Sub-Device 1**
and **Sub-Device 2**. The programming model allows accessing **Device 2** as a single logical device, or
by working with each individual sub-device. For the former case a programmer needs to create
a queue for **Device 2**. For the latter case a programmer needs to create 2 queues, one for each sub-device.

`SYCL*`_ standard introduces a concept of the *Unified Shared Memory* (USM). USM requires hardware support
for unified virtual address space, which allows coherency between the host and the device
pointers. All memory is allocated by the host, but it offers three distinct allocation types:

* **Host: located on the host, accessible by the host or device.** This type of memory is useful in a situation
when you need to stream a read-only data from the host to the device once.

* **Device: located on the device, accessible only by the device.** This type of memory is the fastest one.
Useful in a situation when most of data crunching happens on the device.

* **Shared: located on both the host and the device (copies are synchronized by underlying software), accessible by
the host or device.** Shared allocations are useful when data are accessed by both host and devices,
since a user does not need to explicitly manage data migration. However, it is much slower than USM Device memory type.

Compute-Follows-Data
********************
Since data copying between devices is typically very expensive, for performance reasons it is essential
to process data close to where it is allocated. This is the premise of the *Compute-Follows-Data* programming model,
which states that the compute will happen where the data resides. Tensors implemented in ``dpctl`` and ``dpnp``
carry information about allocation queues, and hence, about the device on which an array is allocated.
Based on tensor input arguments of the offload kernel, it deduces the queue on which the execution takes place.

.. image:: ./_images/kernel-queue-device.png
:width: 600px
:align: center
:alt: SIMD

The above picture illustrates the *Compute-Follows-Data* concept. Arrays ``A`` and ``B`` are inputs to the
**Offload Kernel**. These arrays carry information about their *allocation queue* (**Device Queue**) and the
*device* (**Device 1**) where they were created. According to the Compute-Follows-Data paradigm
the **Offload Kernel** will be submitted to this **Device Queue**, and the resulting array ``C`` will
be created on the **Device Queue** associated with the **Device 1**.

**Data Parallel Extensions for Python** require all input tensor arguments to have the **same** allocation queue,
otherwise an exception will be thrown. For example, the following usages will result in an exception.

.. figure:: ./_images/queue-exception1.png
:width: 600px
:align: center
:alt: SIMD

Input tensors are on different devices and different queues. Exception is thrown.

.. figure:: ./_images/queue-exception2.png
:width: 600px
:align: center
:alt: SIMD

Input tensors are on the same device but queues are different. Exception is thrown.

.. figure:: ./_images/queue-exception3.png
:width: 600px
:align: center
:alt: SIMD

Data belongs to the same device, but the queues are different and associated with different sub-devices. Exception is thrown.

Copying data between devices and queues
***************************************

**Data Parallel Extensions for Python** create **one** *canonical queue* per device so that in
normal circumstances you do not need to directly manage queues. Having one canonical queue per device
allows you to copy data between devices using the ``to_device()`` method:

.. code-block:: python

a_new = a.to_device(b.device)

Array ``a`` will be copied to the device associated with array ``b`` into the new array ``a_new``.
The same queue will be associated with ``b`` and ``a_new``.

Alternatively, you can do this as follows:

.. code-block:: python
:caption: DPNP array

a_new = dpnp.asarray(a, device=b.device)

.. code-block:: python
:caption: DPCtl array

a_new = dpctl.tensor.asarray(a, device=b.device)

Creating additional queues
**************************

As previously indicated, **Data Parallel Extensions for Python** automatically create one canonical queue per device,
and you normally work with this queue implicitly. However, you can always create as many additional queues per device
as needed, and work with them explicitly.

A typical situation when you will want to create the queue explicitly is for profiling purposes.
Read `Data Parallel Control`_ documentation for more details about queues.

2 changes: 1 addition & 1 deletion docs/sources/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ gains on data parallel devices such as GPUs. It consists of three foundational p
* **numba_dpex** - Data Parallel Extensions for `Numba*`_ - extension for Numba compiler
that enables programming data parallel devices the same way you program CPU with Numba.
* **dpctl - Data Parallel Control library** that provides utilities for device selection,
allocation of data on devices, tensor data structure along with `Python Array API Standard`_ implementation, and support for creation of user-defined data-parallel extensions.
allocation of data on devices, tensor data structure along with `Python* Array API Standard`_ implementation, and support for creation of user-defined data-parallel extensions.

Table of Contents
*****************
Expand Down
2 changes: 1 addition & 1 deletion docs/sources/prerequisites_and_installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Data Parallel Extensions for Python.
**********************

You will need Python 3.8, 3.9, or 3.10 installed on your system. If you do not have one yet the easiest way to do
that is to install `Intel Distribution for Python`_.
that is to install `Intel Distribution for Python*`_.
It will install all essential Python numerical and machine
learning packages optimized for Intel hardware, including Data Parallel Extensions for Python*.
If you have Python installation from another vendor, it is fine too. All you need is to install Data Parallel
Expand Down