If necessary, you can also access data in Azure Data Lake Storage Gen1 by using URIs starting with
``adl://`` and ``abfs://``, as described in the `README of the adlfs repo <https://github.com/fsspec/adlfs/blob/main/README.md>`_.
Accessing Azure ML Datastores with ``fsspec`` DataPipes
-------------------------------------------------------
An Azure ML datastore is a *reference* to an existing storage account on Azure.
- Authentication is automatically handled - both *credential-based* access (service principal/SAS/key) and *identity-based* access (Azure Active Directory/managed identity) are supported. When using credential-based authentication, you do not need to expose secrets in your code.
This requires installing the ``azureml-fsspec`` library.
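As a rough sketch, the ``azureml://`` URI that ``azureml-fsspec`` resolves can be assembled from the datastore's coordinates. All subscription, workspace, and path values below are hypothetical placeholders, not real Azure resources:

```python
# Sketch: assembling the ``azureml://`` URI scheme used by azureml-fsspec.
# Every component passed in below is a hypothetical placeholder.
def azureml_uri(subscription_id, resource_group, workspace, datastore, path):
    """Build an Azure ML datastore URI for use with fsspec-based DataPipes."""
    return (
        f"azureml://subscriptions/{subscription_id}"
        f"/resourcegroups/{resource_group}"
        f"/workspaces/{workspace}"
        f"/datastores/{datastore}"
        f"/paths/{path}"
    )

uri = azureml_uri("<sub-id>", "<resource-group>", "<workspace>", "<datastore>", "train/")
print(uri)
```

With ``azureml-fsspec`` installed, a URI of this shape can then be handed to the ``fsspec`` DataPipes (for example a file lister or opener) in the same way as the ``adl://`` and ``abfs://`` URIs above.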
StatefulDataLoader is a drop-in replacement for `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`_ which offers ``state_dict`` / ``load_state_dict`` methods for handling mid-epoch checkpointing which operate on the previous/next iterator requested from the dataloader (resp.).
By default, the state includes the number of batches yielded and uses this to naively fast-forward the sampler (map-style) or the dataset (iterable-style). However if the sampler and/or dataset include ``state_dict`` / ``load_state_dict`` methods, then it will call them during its own ``state_dict`` / ``load_state_dict`` calls. Under the hood, :class:`StatefulDataLoader` handles aggregation and distribution of state across multiprocess workers (but not across ranks).
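The naive fast-forward behavior described above can be illustrated with a toy loader. This is *not* the torchdata implementation, only a minimal sketch of the ``state_dict`` / ``load_state_dict`` contract: the checkpoint records how many batches were yielded, and loading it skips that many batches on the next iteration.

```python
# Minimal sketch of mid-epoch checkpointing semantics; a toy stand-in for
# StatefulDataLoader, not the real torchdata implementation.
class ToyStatefulLoader:
    def __init__(self, data):
        self.data = list(data)
        self._yielded = 0  # number of batches already produced

    def __iter__(self):
        for i, batch in enumerate(self.data):
            if i < self._yielded:
                continue  # fast-forward past batches consumed before the checkpoint
            self._yielded = i + 1
            yield batch

    def state_dict(self):
        return {"yielded": self._yielded}

    def load_state_dict(self, sd):
        self._yielded = sd["yielded"]


loader = ToyStatefulLoader(range(5))
it = iter(loader)
first_two = [next(it), next(it)]  # consume two batches: [0, 1]
ckpt = loader.state_dict()        # mid-epoch checkpoint

resumed = ToyStatefulLoader(range(5))
resumed.load_state_dict(ckpt)
rest = list(resumed)              # resumes at the third batch: [2, 3, 4]
```

A sampler or dataset that defines its own ``state_dict`` / ``load_state_dict`` would replace this counter-based fast-forward with its exact internal state.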
From ``torchdata/stateful_dataloader/stateful_dataloader.py``:
class StatefulDataLoader(DataLoader[_T_co]):
r"""
This is a drop-in replacement for ``torch.utils.data.DataLoader``
that implements ``state_dict`` and ``load_state_dict`` methods, enabling mid-epoch
checkpointing.
All arguments are identical to ``torch.utils.data.DataLoader``, with
a new kwarg: ``snapshot_every_n_steps``.
Args:
dataset (Dataset): dataset from which to load the data.
maintain the workers' ``Dataset`` instances alive. (default: ``False``)
pin_memory_device (str, optional): the device to :attr:`pin_memory` to if ``pin_memory`` is
``True``.
snapshot_every_n_steps (int, optional): Defines how often the state is
transferred from the dataloader workers to the dataloader. By default, it is set to ``1``, i.e., state is transferred every step. If the state is large, this value can be increased (and ideally set to the frequency of training checkpointing) to reduce the overhead of transferring state every step.
.. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn`
cannot be an unpicklable object, e.g., a lambda function. See
`multiprocessing-best-practices <https://pytorch.org/docs/stable/notes/multiprocessing.html#multiprocessing-best-practices>`_ for more details related
to multiprocessing in PyTorch.
.. warning:: ``len(dataloader)`` heuristic is based on the length of the sampler used.
dropped when :attr:`drop_last` is set. Unfortunately, PyTorch cannot detect such
cases in general.
See `Dataset Types <https://pytorch.org/docs/stable/data.html>`_ for more details on these two types of datasets and how
:class:`~torch.utils.data.IterableDataset` interacts with
`Multi-process data loading <https://pytorch.org/docs/stable/data.html#multi-process-data-loading>`_.

.. warning:: See `Reproducibility <https://pytorch.org/docs/stable/notes/randomness.html#reproducibility>`_, `DataLoader workers random seed <https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed>`_, and
   `Data loading randomness <https://pytorch.org/docs/stable/data.html#data-loading-randomness>`_ notes for random-seed-related questions.