Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 1 addition & 170 deletions docs/sources/conf.py
Original file line number Diff line number Diff line change
@@ -1,172 +1,3 @@
""""
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

import sphinx_rtd_theme

# -- Project information -----------------------------------------------------

project = "Data Parallel Extensions for Python"
copyright = "2021, Intel"
author = "Intel"

# The full version, including alpha/beta/rc tags
release = "0.0.1"


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"recommonmark",
"sphinx_rtd_theme",
"sphinx.ext.autodoc",
"sphinx.ext.todo",
]

todo_include_todos = True

source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ["_static"]
"""

# *****************************************************************************
# Copyright (c) 2020, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************


# coding: utf-8
# Configuration file for the Sphinx documentation builder.
#
# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Import sdc package to build API Reference -------------------------------
#import os
#import sys
#import shutil

#SDC_DOC_NO_API_REF_STR = 'SDC_DOC_NO_API_REF'
#SDC_DOC_APIREF_DIR = '_api_ref'

#sys.path.insert(0, os.path.relpath('buildscripts'))
#sdc_doc_no_api_ref = False # Generate API Reference by default

#if SDC_DOC_NO_API_REF_STR in os.environ:
# sdc_doc_no_api_ref = os.environ[SDC_DOC_NO_API_REF_STR] == '1'

#if not sdc_doc_no_api_ref:
# if os.path.exists(SDC_DOC_APIREF_DIR):
# shutil.rmtree(SDC_DOC_APIREF_DIR)

# try:
# import sdc
# except ImportError:
# raise ImportError('Cannot import sdc.\n'
# 'Documentation generator for API Reference for a given module expects that module '
# 'to be installed. Use conda/pip install SDC to install it prior to using API Reference '
# 'generation. If you want to disable API Reference generation, set the environment '
# 'variable SDC_DOC_NO_API_REF=1')

# try:
# from apiref_generator import generate_api_reference
# except ImportError:
# raise ImportError('Cannot import apiref_generator', os.getcwd())

# generate_api_reference()

#SDC_DOC_NO_EXAMPLES_STR = 'SDC_DOC_NO_EXAMPLES'
#SDC_DOC_EXAMPLES_DIR = '_examples'

#sdc_doc_no_examples = False # Generate examples list by default
#if SDC_DOC_NO_EXAMPLES_STR in os.environ:
# sdc_doc_no_examples = os.environ[SDC_DOC_NO_EXAMPLES_STR] == '1'

#if not sdc_doc_no_examples:
# if os.path.exists(SDC_DOC_EXAMPLES_DIR):
# shutil.rmtree(SDC_DOC_EXAMPLES_DIR)

# try:
# import sdc
# except ImportError:
# raise ImportError('Cannot import sdc.\n'
# 'Documentation generator for Examples for a given module expects that module '
# 'to be installed. Use conda/pip install SDC to install it prior to using API Examples '
# 'generation. If you want to disable Examples generation, set the environment '
# 'variable SDC_DOC_NO_EXAMPLES_STR=1')

# try:
# from examples_generator import generate_examples
# except ImportError:
# raise ImportError('Cannot import examples_generator', os.getcwd())

# generate_examples()

# -- Project information -----------------------------------------------------

project = 'Data Parallel Extensions for Python*'
Expand All @@ -176,7 +7,6 @@
# The full version, including alpha/beta/rc tags
release = '0.1'


# -- General configuration ----------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
Expand All @@ -188,6 +18,7 @@
'sphinx.ext.extlinks',
'sphinx.ext.githubpages',
'sphinx.ext.napoleon',
'sphinx.ext.autosectionlabel',
'sphinxcontrib.programoutput',
]

Expand Down
9 changes: 7 additions & 2 deletions docs/sources/ext_links.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,10 @@
**********************************************************
.. _NumPy*: https://numpy.org/
.. _Numba*: https://numba.pydata.org/
.. _Python Array API Standard: https://data-apis.org/array-api/
.. _Intel Distribution for Python: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
.. _Python* Array API Standard: https://data-apis.org/array-api/
.. _Intel Distribution for Python*: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
.. _OpenCl*: https://www.khronos.org/opencl/
.. _DPC++: https://www.apress.com/gp/book/9781484255735
.. _Data Parallel Extension for Numba*: https://intelpython.github.io/numba-dpex/latest/index.html
.. _SYCL*: https://www.khronos.org/sycl/
.. _Data Parallel Control: https://intelpython.github.io/dpctl/latest/index.html
144 changes: 144 additions & 0 deletions docs/sources/heterogeneous_computing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,147 @@

Heterogeneous computing
=======================

Device Offload
**************

Python is an interpreted language, which implies that most Python code will run on the CPU,
and only a few data parallel regions will execute on data parallel devices.
That is why the concept of host and offload devices is useful when it comes to conceptualizing
a heterogeneous programming model in Python.

.. image:: ./_images/hetero-devices.png
:width: 600px
:align: center
:alt: SIMD

The above diagram illustrates the *host* (the CPU which runs Python interpreter) and three *devices*
(two GPU devices and one attached accelerator device). **Data Parallel Extensions for Python**
offer a programming model where a script executed by the Python interpreter on the host can *offload* data
parallel kernels to user-specified device. A *kernel* is the *data parallel region* of a program submitted
for execution on the device. There can be multiple data parallel regions, and hence multiple *offload kernels*.

Kernels can be pre-compiled into a library, such as ``dpnp``, or, alternatively, directly coded
in a programming language for heterogeneous computing, such as `OpenCl*`_ or `DPC++`_ .
**Data Parallel Extensions for Python** offer a way of writing kernels directly in Python
using `Numba*`_ compiler along with ``numba-dpex``, the `Data Parallel Extension for Numba*`_.

One or more kernels are submitted for execution into a *queue* targeting an *offload device*.
For each device one or more queues can be created. In most cases you won’t need to work
with device queues directly. Data Parallel Extensions for Python will do the necessary underlying
work with queues for you through the :ref:`Compute-Follows-Data`.

Unified Shared Memory
*********************

Each device has its own memory, not necessarily accessible from another device.

.. image:: ./_images/hetero-devices.png
:width: 600px
:align: center
:alt: SIMD

For example, **Device 1** memory may not be directly accessible from the host, but only accessible
via expensive copying by a driver software. Similarly, depending on the architecture, direct data
exchange between **Device 2** and **Device 1** may be impossible, and only possible via expensive
copying through the host memory. These aspects must be taken into consideration when programming
data parallel devices.

In the above illustration the **Device 2** logically consists of two sub-devices, **Sub-Device 1**
and **Sub-Device 2**. The programming model allows accessing **Device 2** as a single logical device, or
by working with each individual sub-device. For the former case a programmer needs to create
a queue for **Device 2**. For the latter case a programmer needs to create 2 queues, one for each sub-device.

`SYCL*`_ standard introduces a concept of the *Unified Shared Memory* (USM). USM requires hardware support
for unified virtual address space, which allows coherency between the host and the device
pointers. All memory is allocated by the host, but it offers three distinct allocation types:

* **Host: located on the host, accessible by the host or device.** This type of memory is useful in a situation
when you need to stream a read-only data from the host to the device once.

* **Device: located on the device, accessible only by the device.** This type of memory is the fastest one.
Useful in a situation when most of data crunching happens on the device.

* **Shared: located on both the host and the device (copies are synchronized by underlying software), accessible by
the host or device.** Shared allocations are useful when data are accessed by both host and devices,
since a user does not need to explicitly manage data migration. However, it is much slower than USM Device memory type.

Compute-Follows-Data
********************
Since data copying between devices is typically very expensive, for performance reasons it is essential
to process data close to where it is allocated. This is the premise of the *Compute-Follows-Data* programming model,
which states that the compute will happen where the data resides. Tensors implemented in ``dpctl`` and ``dpnp``
carry information about allocation queues, and hence, about the device on which an array is allocated.
Based on tensor input arguments of the offload kernel, it deduces the queue on which the execution takes place.

.. image:: ./_images/kernel-queue-device.png
:width: 600px
:align: center
:alt: SIMD

The above picture illustrates the *Compute-Follows-Data* concept. Arrays ``A`` and ``B`` are inputs to the
**Offload Kernel**. These arrays carry information about their *allocation queue* (**Device Queue**) and the
*device* (**Device 1**) where they were created. According to the Compute-Follows-Data paradigm
the **Offload Kernel** will be submitted to this **Device Queue**, and the resulting array ``C`` will
be created on the **Device Queue** associated with the **Device 1**.

**Data Parallel Extensions for Python** require all input tensor arguments to have the **same** allocation queue,
otherwise an exception will be thrown. For example, the following usages will result in an exception.

.. figure:: ./_images/queue-exception1.png
:width: 600px
:align: center
:alt: SIMD

Input tensors are on different devices and different queues. Exception is thrown.

.. figure:: ./_images/queue-exception2.png
:width: 600px
:align: center
:alt: SIMD

Input tensors are on the same device but queues are different. Exception is thrown.

.. figure:: ./_images/queue-exception3.png
:width: 600px
:align: center
:alt: SIMD

Data belongs to the same device, but the queues are different and associated with different sub-devices. Exception is thrown.

Copying data between devices and queues
***************************************

**Data Parallel Extensions for Python** create **one** *canonical queue* per device so that in
normal circumstances you do not need to directly manage queues. Having one canonical queue per device
allows you to copy data between devices using the ``to_device()`` method:

.. code-block:: python

a_new = a.to_device(b.device)

Array ``a`` will be copied to the device associated with array ``b`` into the new array ``a_new``.
The same queue will be associated with ``b`` and ``a_new``.

Alternatively, you can do this as follows:

.. code-block:: python
:caption: DPNP array

a_new = dpnp.asarray(a, device=b.device)

.. code-block:: python
:caption: DPCtl array

a_new = dpctl.tensor.asarray(a, device=b.device)

Creating additional queues
**************************

As previously indicated, **Data Parallel Extensions for Python** automatically create one canonical queue per device,
and you normally work with this queue implicitly. However, you can always create as many additional queues per device
as needed, and work with them explicitly.

A typical situation when you will want to create the queue explicitly is for profiling purposes.
Read `Data Parallel Control`_ documentation for more details about queues.

2 changes: 1 addition & 1 deletion docs/sources/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ gains on data parallel devices such as GPUs. It consists of three foundational p
* **numba_dpex** - Data Parallel Extensions for `Numba*`_ - extension for Numba compiler
that enables programming data parallel devices the same way you program CPU with Numba.
* **dpctl - Data Parallel Control library** that provides utilities for device selection,
allocation of data on devices, tensor data structure along with `Python Array API Standard`_ implementation, and support for creation of user-defined data-parallel extensions.
allocation of data on devices, tensor data structure along with `Python* Array API Standard`_ implementation, and support for creation of user-defined data-parallel extensions.

Table of Contents
*****************
Expand Down
2 changes: 1 addition & 1 deletion docs/sources/prerequisites_and_installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Data Parallel Extensions for Python.
**********************

You will need Python 3.8, 3.9, or 3.10 installed on your system. If you do not have one yet the easiest way to do
that is to install `Intel Distribution for Python`_.
that is to install `Intel Distribution for Python*`_.
It will install all essential Python numerical and machine
learning packages optimized for Intel hardware, including Data Parallel Extensions for Python*.
If you have Python installation from another vendor, it is fine too. All you need is to install Data Parallel
Expand Down