diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index 3d966d34904..cf1812194f1 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -210,7 +210,7 @@ class Book(BaseDoc): If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of - type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: + type `AnyTensor`, the doc_vec tensor will be returned instead of a list: ```python class Image(BaseDoc): diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 9f153e2f1bd..16dca6a5bb0 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -358,10 +358,9 @@ def from_csv( :param dialect: defines separator and how to handle whitespaces etc. Can be a [`csv.Dialect`](https://docs.python.org/3/library/csv.html#csv.Dialect) instance or one string of: - - - 'excel' (for comma separated values), - - 'excel-tab' (for tab separated values), - - 'unix' (for csv file generated on UNIX systems). + `'excel'` (for comma separated values), + `'excel-tab'` (for tab separated values), + `'unix'` (for csv file generated on UNIX systems). :return: `DocList` object """ @@ -428,10 +427,10 @@ def to_csv( :param dialect: defines separator and how to handle whitespaces etc. Can be a [`csv.Dialect`](https://docs.python.org/3/library/csv.html#csv.Dialect) instance or one string of: + `'excel'` (for comma separated values), + `'excel-tab'` (for tab separated values), + `'unix'` (for csv file generated on UNIX systems). - - 'excel' (for comma seperated values), - - 'excel-tab' (for tab separated values), - - 'unix' (for csv file generated on UNIX systems). 
""" fields = self.doc_type._get_access_paths() diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 7d692b31084..adb701d2a11 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -59,32 +59,34 @@ class DocVec(AnyDocArray[T_doc]): computation that require batches of data (ex: matrix multiplication, distance calculation, deep learning forward pass) - A DocVec has a similar interface as - {class}`~docarray.array.DocList` but with an underlying implementation that is - column based instead of row based. Each field - of the schema of the DocVec - (the :attr:`~docarray.array.doc_vec.DocVec.doc_type` which is a - `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, doc_vec (torch/np/tf) tensor. - If the tensor field - is `AnyTensor` or a Union of tensor types, the - :attr:`~docarray.array.doc_vec.DocVec.tensor_type` will be used to determine - the type of the doc_vec column. - - If the field is another `BasedDoc` the column will be another DocVec that follows the - schema of the nested Document. - If the field is a `DocList` or - `DocVec` then the column will be a list of `DocVec`. + A DocVec has a similar interface as [`DocList`][docarray.array.DocList] + but with an underlying implementation that is column based instead of row based. + Each field of the schema of the `DocVec` (the `.doc_type` which is a + [`BaseDoc`][docarray.BaseDoc]) will be stored in a column. + + If the field is a tensor, the data from all Documents will be stored as a single + doc_vec (torch/np/tf) tensor. + + If the tensor field is `AnyTensor` or a Union of tensor types, the + `.tensor_type` will be used to determine the type of the doc_vec column. + + If the field is another [`BaseDoc`][docarray.BaseDoc] the column will be another + `DocVec` that follows the schema of the nested Document. 
+ + If the field is a [`DocList`][docarray.DocList] or `DocVec` then the column will + be a list of `DocVec`. + For any other type the column is a Python list. - Every `Document` inside a `DocVec` is a view into the data columns stored at the `DocVec` level. The `BaseDoc` does - not hold any data itself. The behavior of - this Document "view" is similar to the behavior of `view = tensor[i]` in - numpy/PyTorch. + Every `Document` inside a `DocVec` is a view into the data columns stored at the + `DocVec` level. The `BaseDoc` does not hold any data itself. The behavior of + this Document "view" is similar to the behavior of `view = tensor[i]` in + numpy/PyTorch. - :param docs: a homogeneous sequence of BaseDoc + :param docs: a homogeneous sequence of `BaseDoc` :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful - if the BaseDoc of this DocVec has some undefined tensor type like - AnyTensor or Union of NdArray and TorchTensor + if the BaseDoc of this DocVec has some undefined tensor type like + AnyTensor or Union of NdArray and TorchTensor """ doc_type: Type[T_doc] diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 471e97483ba..754e6c9b789 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -24,9 +24,9 @@ def update(self, other: T): """ Updates self with the content of other. Changes are applied to self. 
Updating one Document with another consists in the following: - - setting data properties of the second Document to the first Document - if they are not None: + - Setting data properties of the second Document to the first Document + if they are not None - Concatenating lists and updating sets - Updating recursively Documents and DocArrays - Updating Dictionaries of the left with the right @@ -35,9 +35,9 @@ def update(self, other: T): it is applied to a static schema type, the presence of the field is given by the field not having a None value and that DocArrays, lists and sets are concatenated. It is worth mentioning that Tuples - are not merged together since they are meant to be inmutable, + are not merged together since they are meant to be immutable, so they behave as regular types and the value of `self` is updated - with the value of `other` + with the value of `other`. --- diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 25fbb9a9a6a..f174326c2a1 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -14,30 +14,31 @@ class MultiModalDataset(Dataset, Generic[T_doc]): A dataset that can be used inside a PyTorch DataLoader. In other words, it implements the PyTorch Dataset interface. - :param docs: the DocList to be used as the dataset - :param preprocessing: a dictionary of field names and preprocessing functions - The preprocessing dictionary passed to the constructor consists of keys that are field names and values that are functions that take a single argument and return a single argument. - EXAMPLE USAGE - .. 
code-block:: python + --- + + ```python from torch.utils.data import DataLoader from docarray import DocList from docarray.data import MultiModalDataset - from docarray.documents import Text + from docarray.documents import TextDoc def prepend_number(text: str): return f"Number {text}" - docs = DocList[Text](Text(text=str(i)) for i in range(16)) - ds = MultiModalDataset[Text](docs, preprocessing={'text': prepend_number}) - loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[Text].collate_fn) + docs = DocList[TextDoc](TextDoc(text=str(i)) for i in range(16)) + ds = MultiModalDataset[TextDoc](docs, preprocessing={'text': prepend_number}) + loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[TextDoc].collate_fn) for batch in loader: print(batch.text) + ``` + + --- Nested fields can be accessed by using dot notation. The document itself can be accessed using the empty string as the key. @@ -47,24 +48,25 @@ def prepend_number(text: str): The transformations will be applied according to their order in the dictionary. - EXAMPLE USAGE - .. 
code-block:: python + --- + + ```python import torch from torch.utils.data import DataLoader from docarray import DocList, BaseDoc from docarray.data import MultiModalDataset - from docarray.documents import Text + from docarray.documents import TextDoc class Thesis(BaseDoc): - title: Text + title: TextDoc class Student(BaseDoc): thesis: Thesis - def embed_title(title: Text): + def embed_title(title: TextDoc): title.embedding = torch.ones(4) @@ -90,6 +92,12 @@ def add_nonsense(student: Student): loader = DataLoader(ds, batch_size=4, collate_fn=ds.collate_fn) for batch in loader: print(batch.thesis.title.embedding) + ``` + + --- + + :param docs: the `DocList` to be used as the dataset + :param preprocessing: a dictionary of field names and preprocessing functions """ doc_type: Optional[Type[BaseDoc]] = None diff --git a/docarray/typing/tensor/audio/abstract_audio_tensor.py b/docarray/typing/tensor/audio/abstract_audio_tensor.py index 56fdae6c05e..b987b2addfd 100644 --- a/docarray/typing/tensor/audio/abstract_audio_tensor.py +++ b/docarray/typing/tensor/audio/abstract_audio_tensor.py @@ -16,7 +16,7 @@ class AbstractAudioTensor(AbstractTensor, ABC): def to_bytes(self) -> 'AudioBytes': """ - Convert audio tensor to AudioBytes. + Convert audio tensor to [`AudioBytes`][docarray.typing.AudioBytes]. """ from docarray.typing.bytes.audio_bytes import AudioBytes diff --git a/docarray/typing/tensor/image/abstract_image_tensor.py b/docarray/typing/tensor/image/abstract_image_tensor.py index 0a880be9865..9566910781d 100644 --- a/docarray/typing/tensor/image/abstract_image_tensor.py +++ b/docarray/typing/tensor/image/abstract_image_tensor.py @@ -15,7 +15,7 @@ class AbstractImageTensor(AbstractTensor, ABC): def to_bytes(self, format: str = 'PNG') -> 'ImageBytes': """ - Convert image tensor to ImageBytes. + Convert image tensor to [`ImageBytes`][docarray.typing.ImageBytes]. :param format: the image format use to store the image, can be 'PNG' , 'JPG' ...
:return: an ImageBytes object diff --git a/docarray/typing/tensor/image/image_tensorflow_tensor.py b/docarray/typing/tensor/image/image_tensorflow_tensor.py index c95b001e704..f373f45b30e 100644 --- a/docarray/typing/tensor/image/image_tensorflow_tensor.py +++ b/docarray/typing/tensor/image/image_tensorflow_tensor.py @@ -14,7 +14,8 @@ class ImageTensorFlowTensor( """ Subclass of [`TensorFlowTensor`][docarray.typing.TensorFlowTensor], to represent an image tensor. Adds image-specific features to the tensor. - For instance the ability convert the tensor back to image bytes which are + For instance the ability to convert the tensor back to + [`ImageBytes`][docarray.typing.ImageBytes] which are optimized to send over the wire. diff --git a/docarray/typing/tensor/image/image_torch_tensor.py b/docarray/typing/tensor/image/image_torch_tensor.py index 249030c00f6..103a936d705 100644 --- a/docarray/typing/tensor/image/image_torch_tensor.py +++ b/docarray/typing/tensor/image/image_torch_tensor.py @@ -12,7 +12,8 @@ class ImageTorchTensor(AbstractImageTensor, TorchTensor, metaclass=metaTorchAndN """ Subclass of [`TorchTensor`][docarray.typing.TorchTensor], to represent an image tensor. Adds image-specific features to the tensor. - For instance the ability convert the tensor back to image bytes which are + For instance the ability to convert the tensor back to + [`ImageBytes`][docarray.typing.ImageBytes] which are optimized to send over the wire. diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index d2ed61eacee..173daaacce8 100644 --- a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -135,7 +135,7 @@ def to_bytes( audio_format: str = 'fltp', ) -> 'VideoBytes': """ - Convert video tensor to VideoBytes. + Convert video tensor to [`VideoBytes`][docarray.typing.VideoBytes]. :param audio_tensor: AudioTensor containing the video's soundtrack.
:param video_frame_rate: video frames per second. diff --git a/docarray/typing/url/url_3d/mesh_url.py b/docarray/typing/url/url_3d/mesh_url.py index 9ba5e330e6e..70f32eb5581 100644 --- a/docarray/typing/url/url_3d/mesh_url.py +++ b/docarray/typing/url/url_3d/mesh_url.py @@ -26,33 +26,33 @@ def load( trimesh_args: Optional[Dict[str, Any]] = None, ) -> 'VerticesAndFaces': """ - Load the data from the url into a VerticesAndFaces object containing - vertices and faces information. + Load the data from the url into a [`VerticesAndFaces`][docarray.documents.VerticesAndFaces] + object containing vertices and faces information. --- - ```python - from docarray import BaseDoc + ```python + from docarray import BaseDoc - from docarray.typing import Mesh3DUrl, NdArray + from docarray.typing import Mesh3DUrl, NdArray - class MyDoc(BaseDoc): - mesh_url: Mesh3DUrl + class MyDoc(BaseDoc): + mesh_url: Mesh3DUrl - doc = MyDoc(mesh_url="toydata/tetrahedron.obj") + doc = MyDoc(mesh_url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") - tensors = doc.mesh_url.load() - assert isinstance(tensors.vertices, NdArray) - assert isinstance(tensors.faces, NdArray) - ``` + tensors = doc.mesh_url.load() + assert isinstance(tensors.vertices, NdArray) + assert isinstance(tensors.faces, NdArray) + ``` - --- - :param skip_materials: Skip materials if True, else skip. - :param trimesh_args: dictionary of additional arguments for `trimesh.load()` - or `trimesh.load_remote()`. - :return: VerticesAndFaces object containing vertices and faces information. + + :param skip_materials: Skip materials if True, else load them. + :param trimesh_args: dictionary of additional arguments for `trimesh.load()` + or `trimesh.load_remote()`. + :return: VerticesAndFaces object containing vertices and faces information.
""" from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces diff --git a/docarray/typing/url/url_3d/point_cloud_url.py b/docarray/typing/url/url_3d/point_cloud_url.py index dd3f17be0df..efe6ce6ae0e 100644 --- a/docarray/typing/url/url_3d/point_cloud_url.py +++ b/docarray/typing/url/url_3d/point_cloud_url.py @@ -29,7 +29,7 @@ def load( trimesh_args: Optional[Dict[str, Any]] = None, ) -> 'PointsAndColors': """ - Load the data from the url into an NdArray containing point cloud information. + Load the data from the url into an `NdArray` containing point cloud information. --- @@ -45,7 +45,7 @@ class MyDoc(BaseDoc): point_cloud_url: PointCloud3DUrl - doc = MyDoc(point_cloud_url="toydata/tetrahedron.obj") + doc = MyDoc(point_cloud_url="thttps://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") # point_cloud = doc.point_cloud_url.load(samples=100) @@ -96,20 +96,24 @@ def display( First, it loads the point cloud into a `PointsAndColors` object, and then calls display on it. The following is therefore equivalent: - .. code-block:: python + --- - import numpy as np - from docarray import BaseDoc + ```python + import numpy as np + from docarray import BaseDoc - from docarray.documents import PointCloud3D + from docarray.documents import PointCloud3D - pc = PointCloud3D("toydata/tetrahedron.obj") + pc = PointCloud3D(url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") - # option 1 - pc.url.display() + # option 1 + # pc.url.display() - # option 2 (equivalent) - pc.url.load(samples=10000).display() + # option 2 (equivalent) + # pc.url.load(samples=10000).display() + ``` + + --- :param samples: number of points to sample from the mesh. 
""" diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index db9dd4b5080..8c5f0e6d995 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -73,8 +73,7 @@ class MyDoc(BaseDoc): --- :param kwargs: supports all keyword arguments that are being supported by - av.open() as described in: - https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open + av.open() as described [here](https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open) :return: [`AudioNdArray`][docarray.typing.AudioNdArray] representing the audio content, [`VideoNdArray`][docarray.typing.VideoNdArray] representing the images of the video,