diff --git a/docs/sources/conf.py b/docs/sources/conf.py index 1360b95..42f7310 100644 --- a/docs/sources/conf.py +++ b/docs/sources/conf.py @@ -1,172 +1,3 @@ -"""" -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - -import sphinx_rtd_theme - -# -- Project information ----------------------------------------------------- - -project = "Data Parallel Extensions for Python" -copyright = "2021, Intel" -author = "Intel" - -# The full version, including alpha/beta/rc tags -release = "0.0.1" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "recommonmark", - "sphinx_rtd_theme", - "sphinx.ext.autodoc", - "sphinx.ext.todo", -] - -todo_include_todos = True - -source_parsers = {".md": "recommonmark.parser.CommonMarkParser"} - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ["_static"] -""" - -# ***************************************************************************** -# Copyright (c) 2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ***************************************************************************** - - -# coding: utf-8 -# Configuration file for the Sphinx documentation builder. -# -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Import sdc package to build API Reference ------------------------------- -#import os -#import sys -#import shutil - -#SDC_DOC_NO_API_REF_STR = 'SDC_DOC_NO_API_REF' -#SDC_DOC_APIREF_DIR = '_api_ref' - -#sys.path.insert(0, os.path.relpath('buildscripts')) -#sdc_doc_no_api_ref = False # Generate API Reference by default - -#if SDC_DOC_NO_API_REF_STR in os.environ: -# sdc_doc_no_api_ref = os.environ[SDC_DOC_NO_API_REF_STR] == '1' - -#if not sdc_doc_no_api_ref: -# if os.path.exists(SDC_DOC_APIREF_DIR): -# shutil.rmtree(SDC_DOC_APIREF_DIR) - -# try: -# import sdc -# except ImportError: -# raise ImportError('Cannot import sdc.\n' -# 'Documentation generator for API Reference for a given module expects that module ' -# 'to be installed. Use conda/pip install SDC to install it prior to using API Reference ' -# 'generation. 
If you want to disable API Reference generation, set the environment ' -# 'variable SDC_DOC_NO_API_REF=1') - -# try: -# from apiref_generator import generate_api_reference -# except ImportError: -# raise ImportError('Cannot import apiref_generator', os.getcwd()) - -# generate_api_reference() - -#SDC_DOC_NO_EXAMPLES_STR = 'SDC_DOC_NO_EXAMPLES' -#SDC_DOC_EXAMPLES_DIR = '_examples' - -#sdc_doc_no_examples = False # Generate examples list by default -#if SDC_DOC_NO_EXAMPLES_STR in os.environ: -# sdc_doc_no_examples = os.environ[SDC_DOC_NO_EXAMPLES_STR] == '1' - -#if not sdc_doc_no_examples: -# if os.path.exists(SDC_DOC_EXAMPLES_DIR): -# shutil.rmtree(SDC_DOC_EXAMPLES_DIR) - -# try: -# import sdc -# except ImportError: -# raise ImportError('Cannot import sdc.\n' -# 'Documentation generator for Examples for a given module expects that module ' -# 'to be installed. Use conda/pip install SDC to install it prior to using API Examples ' -# 'generation. If you want to disable Examples generation, set the environment ' -# 'variable SDC_DOC_NO_EXAMPLES_STR=1') - -# try: -# from examples_generator import generate_examples -# except ImportError: -# raise ImportError('Cannot import examples_generator', os.getcwd()) - -# generate_examples() - # -- Project information ----------------------------------------------------- project = 'Data Parallel Extensions for Python*' @@ -176,7 +7,6 @@ # The full version, including alpha/beta/rc tags release = '0.1' - # -- General configuration ---------------------------------------------------- # Add any Sphinx extension module names here, as strings. 
They can be @@ -188,6 +18,7 @@ 'sphinx.ext.extlinks', 'sphinx.ext.githubpages', 'sphinx.ext.napoleon', + 'sphinx.ext.autosectionlabel', 'sphinxcontrib.programoutput', ] diff --git a/docs/sources/ext_links.txt b/docs/sources/ext_links.txt index 874d847..4faef6b 100644 --- a/docs/sources/ext_links.txt +++ b/docs/sources/ext_links.txt @@ -4,5 +4,10 @@ ********************************************************** .. _NumPy*: https://numpy.org/ .. _Numba*: https://numba.pydata.org/ -.. _Python Array API Standard: https://data-apis.org/array-api/ -.. _Intel Distribution for Python: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html +.. _Python* Array API Standard: https://data-apis.org/array-api/ +.. _Intel Distribution for Python*: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html +.. _OpenCl*: https://www.khronos.org/opencl/ +.. _DPC++: https://www.apress.com/gp/book/9781484255735 +.. _Data Parallel Extension for Numba*: https://intelpython.github.io/numba-dpex/latest/index.html +.. _SYCL*: https://www.khronos.org/sycl/ +.. _Data Parallel Control: https://intelpython.github.io/dpctl/latest/index.html diff --git a/docs/sources/heterogeneous_computing.rst b/docs/sources/heterogeneous_computing.rst index ffdef5f..6341f56 100644 --- a/docs/sources/heterogeneous_computing.rst +++ b/docs/sources/heterogeneous_computing.rst @@ -3,3 +3,147 @@ Heterogeneous computing ======================= + +Device Offload +************** + +Python is an interpreted language, which implies that most of Python codes will run on CPU, +and only a few data parallel regions will execute on data parallel devices. +That is why the concept of host and offload devices is useful when it comes to conceptualizing +a heterogeneous programming model in Python. + +.. 
image:: ./_images/hetero-devices.png
+   :width: 600px
+   :align: center
+   :alt: SIMD
+
+The above diagram illustrates the *host* (the CPU which runs the Python interpreter) and three *devices*
+(two GPU devices and one attached accelerator device). **Data Parallel Extensions for Python**
+offer a programming model where a script executed by the Python interpreter on the host can *offload* data
+parallel kernels to a user-specified device. A *kernel* is the *data parallel region* of a program submitted
+for execution on the device. There can be multiple data parallel regions, and hence multiple *offload kernels*.
+
+Kernels can be pre-compiled into a library, such as ``dpnp``, or, alternatively, directly coded
+in a programming language for heterogeneous computing, such as `OpenCl*`_ or `DPC++`_ .
+**Data Parallel Extensions for Python** offer a way of writing kernels directly in Python
+using the `Numba*`_ compiler along with ``numba-dpex``, the `Data Parallel Extension for Numba*`_.
+
+One or more kernels are submitted for execution into a *queue* targeting an *offload device*.
+For each device one or more queues can be created. In most cases you won’t need to work
+with device queues directly. Data Parallel Extensions for Python will do the necessary underlying
+work with queues for you through the :ref:`Compute-Follows-Data`.
+
+Unified Shared Memory
+*********************
+
+Each device has its own memory, not necessarily accessible from another device.
+
+.. image:: ./_images/hetero-devices.png
+   :width: 600px
+   :align: center
+   :alt: SIMD
+
+For example, **Device 1** memory may not be directly accessible from the host, but only accessible
+via expensive copying by driver software. Similarly, depending on the architecture, direct data
+exchange between **Device 2** and **Device 1** may be impossible, and only possible via expensive
+copying through the host memory. These aspects must be taken into consideration when programming
+data parallel devices. 
+
+In the above illustration the **Device 2** logically consists of two sub-devices, **Sub-Device 1**
+and **Sub-Device 2**. The programming model allows accessing **Device 2** as a single logical device, or
+by working with each individual sub-device. For the former case a programmer needs to create
+a queue for **Device 2**. For the latter case a programmer needs to create 2 queues, one for each sub-device.
+
+The `SYCL*`_ standard introduces the concept of *Unified Shared Memory* (USM). USM requires hardware support
+for a unified virtual address space, which allows coherency between the host and the device
+pointers. All memory is allocated by the host, but it offers three distinct allocation types:
+
+* **Host: located on the host, accessible by the host or device.** This type of memory is useful in a situation
+  when you need to stream read-only data from the host to the device once.
+
+* **Device: located on the device, accessible only by the device.** This type of memory is the fastest one.
+  Useful in a situation when most of the data crunching happens on the device.
+
+* **Shared: located on both the host and device (copies are synchronized by underlying software), accessible by
+  the host or device.** Shared allocations are useful when data are accessed by both host and devices,
+  since a user does not need to explicitly manage data migration. However, it is much slower than the USM Device memory type.
+
+Compute-Follows-Data
+********************
+Since data copying between devices is typically very expensive, for performance reasons it is essential
+to process data close to where it is allocated. This is the premise of the *Compute-Follows-Data* programming model,
+which states that the compute will happen where the data resides. Tensors implemented in ``dpctl`` and ``dpnp``
+carry information about allocation queues, and hence, about the device on which an array is allocated. 
+Based on tensor input arguments of the offload kernel, it deduces the queue on which the execution takes place. + +.. image:: ./_images/kernel-queue-device.png + :width: 600px + :align: center + :alt: SIMD + +The above picture illustrates the *Compute-Follows-Data* concept. Arrays ``A`` and ``B`` are inputs to the +**Offload Kernel**. These arrays carry information about their *allocation queue* (**Device Queue**) and the +*device* (**Device 1**) where they were created. According to the Compute-Follows-Data paradigm +the **Offload Kernel** will be submitted to this **Device Queue**, and the resulting array ``C`` will +be created on the **Device Queue** associated with the **Device 1**. + +**Data Parallel Extensions for Python** require all input tensor arguments to have the **same** allocation queue, +otherwise an exception will be thrown. For example, the following usages will result in the exception. + +.. figure:: ./_images/queue-exception1.png + :width: 600px + :align: center + :alt: SIMD + + Input tensors are on different devices and different queues. Exception is thrown. + +.. figure:: ./_images/queue-exception2.png + :width: 600px + :align: center + :alt: SIMD + + Input tensors are on the same device but queues are different. Exception is thrown. + +.. figure:: ./_images/queue-exception3.png + :width: 600px + :align: center + :alt: SIMD + + Data belongs to the same device, but queues are different and associated with different sub-devices. + +Copying data between devices and queues +*************************************** + +**Data Parallel Extensions for Python** create **one** *canonical queue* per device so that in +normal circumstances you do not need to directly manage queues. Having one canonical queue per device +allows you to copy data between devices using to_device() method: + +.. code-block:: python + + a_new = a.to_device(b.device) + +Array ``a`` will be copied to the device associated with array ``b`` into the new array ``a_new``. 
+The same queue will be associated with ``b`` and ``a_new``. + +Alternatively, you can do this as follows: + +.. code-block:: python + :caption: DPNP array + + a_new = dpnp.asarray(a, device=b.device) + +.. code-block:: python + :caption: DPCtl array + + a_new = dpctl.tensor.asarray(a, device=b.device) + +Creating additional queues +************************** + +As previously indicated **Data Parallel Extensions for Python** automatically create one canonical queue per device, +and you normally work with this queue implicitly. However, you can always create as many additional queues per device +as needed, and work with them explicitly. + +A typical situation when you will want to create the queue explicitly is for profiling purposes. +Read `Data Parallel Control`_ documentation for more details about queues. + diff --git a/docs/sources/index.rst b/docs/sources/index.rst index 7263352..6eeb380 100644 --- a/docs/sources/index.rst +++ b/docs/sources/index.rst @@ -18,7 +18,7 @@ gains on data parallel devices such as GPUs. It consists of three foundational p * **numba_dpex** - Data Parallel Extensions for `Numba*`_ - extension for Numba compiler that enables programming data parallel devices the same way you program CPU with Numba. * **dpctl - Data Parallel Control library** that provides utilities for device selection, - allocation of data on devices, tensor data structure along with `Python Array API Standard`_ implementation, and support for creation of user-defined data-parallel extensions. + allocation of data on devices, tensor data structure along with `Python* Array API Standard`_ implementation, and support for creation of user-defined data-parallel extensions. 
Table of Contents ***************** diff --git a/docs/sources/prerequisites_and_installation.rst b/docs/sources/prerequisites_and_installation.rst index b6aa0a8..5475a47 100644 --- a/docs/sources/prerequisites_and_installation.rst +++ b/docs/sources/prerequisites_and_installation.rst @@ -22,7 +22,7 @@ Data Parallel Extensions for Python. ********************** You will need Python 3.8, 3.9, or 3.10 installed on your system. If you do not have one yet the easiest way to do -that is to install `Intel Distribution for Python`_. +that is to install `Intel Distribution for Python*`_. It will install all essential Python numerical and machine learning packages optimized for Intel hardware, including Data Parallel Extensions for Python*. If you have Python installation from another vendor, it is fine too. All you need is to install Data Parallel