diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index d156da9ea8c..da718519682 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -121,7 +121,7 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocList using the passed values + """Set all Documents in this [`DocList`][docarray.typing.DocList] using the passed values :param field: name of the fields to extract :values: the values to set at the DocList level @@ -140,7 +140,7 @@ def to_protobuf(self) -> 'DocListProto': ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a DocList into a NodeProto protobuf message. + """Convert a [`DocList`][docarray.typing.DocList] into a NodeProto protobuf message. This function should be called when a DocList is nested into another Document that need to be converted into a protobuf @@ -157,82 +157,81 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of DocLists, the list will be flattened + results in a nested list or list of [`DocList`s][docarray.typing.DocList], the list will be flattened on the first level. The access path is a string that consists of attribute - names, concatenated and "__"-separated. It describes the path from the first - level to an arbitrary one, e.g. 'content__image__url'. + names, concatenated and `"__"`-separated. It describes the path from the first + level to an arbitrary one, e.g. `'content__image__url'`. - :param access_path: a string that represents the access path ("__"-separated). + :param access_path: a string that represents the access path (`"__"`-separated). :return: list of the accessed objects, flattened if nested. - EXAMPLE USAGE - .. 
code-block:: python - from docarray import BaseDoc, DocList, Text + ```python + from docarray import BaseDoc, DocList, Text - class Author(BaseDoc): - name: str + class Author(BaseDoc): + name: str - class Book(BaseDoc): - author: Author - content: Text + class Book(BaseDoc): + author: Author + content: Text - docs = DocList[Book]( - Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) - for i in range(10) # noqa: E501 - ) + docs = DocList[Book]( + Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) + for i in range(10) # noqa: E501 + ) - books = docs.traverse_flat(access_path='content') # list of 10 Text objs + books = docs.traverse_flat(access_path='content') # list of 10 Text objs - authors = docs.traverse_flat(access_path='author__name') # list of 10 strings + authors = docs.traverse_flat(access_path='author__name') # list of 10 strings + ``` If the resulting list is a nested list, it will be flattened: - EXAMPLE USAGE - .. code-block:: python - from docarray import BaseDoc, DocList - + ```python + from docarray import BaseDoc, DocList - class Chapter(BaseDoc): - content: str + class Chapter(BaseDoc): + content: str - class Book(BaseDoc): - chapters: DocList[Chapter] + class Book(BaseDoc): + chapters: DocList[Chapter] - docs = DocList[Book]( - Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) - for _ in range(10) - ) - chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings + docs = DocList[Book]( + Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) + for _ in range(10) + ) - If your DocList is in doc_vec mode, and you want to access a field of - type AnyTensor, the doc_vec tensor will be returned instead of a list: + chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings + ``` - EXAMPLE USAGE - .. 
code-block:: python - class Image(BaseDoc): - tensor: TorchTensor[3, 224, 224] + If your [`DocList`][docarray.typing.DocList] is in doc_vec mode, and you want to access a field of + type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: + ```python + class Image(BaseDoc): + tensor: TorchTensor[3, 224, 224] - batch = DocList[Image]( - [ - Image( - tensor=torch.zeros(3, 224, 224), - ) - for _ in range(2) - ] - ) - batch_stacked = batch.stack() - tensors = batch_stacked.traverse_flat( - access_path='tensor' - ) # tensor of shape (2, 3, 224, 224) + batch = DocList[Image]( + [ + Image( + tensor=torch.zeros(3, 224, 224), + ) + for _ in range(2) + ] + ) + batch_stacked = batch.stack() + tensors = batch_stacked.traverse_flat( + access_path='tensor' + ) # tensor of shape (2, 3, 224, 224) + ``` """ ... @@ -264,7 +263,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]: def summary(self): """ - Print a summary of this DocList object and a summary of the schema of its + Print a summary of this [`DocList`][docarray.typing.DocList] object and a summary of the schema of its Document type. """ DocArraySummary(self).summary() @@ -276,13 +275,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields `DocList` of size `batch_size`. + Creates a `Generator` that yields [`DocList`][docarray.typing.DocList] of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. 
- :yield: a Generator of `DocList`, each in the length of `batch_size` + :yield: a Generator of [`DocList`][docarray.typing.DocList], each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 89364ff4842..d01d7a31e0d 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -68,9 +68,8 @@ class DocList( homogeneous and follow the same schema. To precise this schema you can use the `DocList[MyDocument]` syntax where MyDocument is a Document class (i.e. schema). This creates a DocList that can only contains Documents of - the type 'MyDocument'. + the type `MyDocument`. - --- ```python from docarray import BaseDoc, DocList @@ -86,36 +85,39 @@ class Image(BaseDoc): docs = DocList[Image]( Image(url='http://url.com/foo.png') for _ in range(10) ) # noqa: E510 - ``` - --- + + # If your DocList is homogeneous (i.e. follows the same schema), you can access + # fields at the DocList level (for example `docs.tensor` or `docs.url`). + + print(docs.url) + # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] - If your DocList is homogeneous (i.e. follows the same schema), you can access - fields at the DocList level (for example `docs.tensor` or `docs.url`). - You can also set fields, with `docs.tensor = np.random.random([10, 100])`: + # You can also set fields, with `docs.tensor = np.random.random([10, 100])`: - print(docs.url) - # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] - import numpy as np + import numpy as np - docs.tensor = np.random.random([10, 100]) - print(docs.tensor) - # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, - # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] 
+ docs.tensor = np.random.random([10, 100]) - You can index into a DocList like a numpy doc_list or torch tensor: + print(docs.tensor) + # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, + # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] - docs[0] # index by position - docs[0:5:2] # index by slice - docs[[0, 2, 3]] # index by list of indices - docs[True, False, True, True, ...] # index by boolean mask + # You can index into a DocList like a numpy doc_list or torch tensor: - You can delete items from a DocList like a Python List + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] # index by boolean mask - del docs[0] # remove first element from DocList - del docs[0:5] # remove elements for 0 to 5 from DocList + + # You can delete items from a DocList like a Python List + + del docs[0] # remove first element from DocList + del docs[0:5] # remove elements for 0 to 5 from DocList + ``` :param docs: iterable of Document @@ -135,10 +137,10 @@ def construct( docs: Sequence[T_doc], ) -> T: """ - Create a DocList without validation any data. The data must come from a + Create a `DocList` without validation any data. 
The data must come from a trusted source :param docs: a Sequence (list) of Document with the same schema - :return: + :return: a `DocList` object """ new_docs = cls.__new__(cls) new_docs._data = docs if isinstance(docs, list) else list(docs) @@ -154,13 +156,13 @@ def __eq__(self, other: Any) -> bool: def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]: """ - Validate if an Iterable of Document are compatible with this DocList + Validate if an Iterable of Document are compatible with this `DocList` """ for doc in docs: yield self._validate_one_doc(doc) def _validate_one_doc(self, doc: T_doc) -> T_doc: - """Validate if a Document is compatible with this DocList""" + """Validate if a Document is compatible with this `DocList`""" if not issubclass(self.doc_type, AnyDoc) and not isinstance(doc, self.doc_type): raise ValueError(f'{doc} is not a {self.doc_type}') return doc @@ -178,16 +180,16 @@ def __bytes__(self) -> bytes: def append(self, doc: T_doc): """ - Append a Document to the DocList. The Document must be from the same class - as the doc_type of this DocList otherwise it will fail. + Append a Document to the `DocList`. The Document must be from the same class + as the `.doc_type` of this `DocList` otherwise it will fail. :param doc: A Document """ self._data.append(self._validate_one_doc(doc)) def extend(self, docs: Iterable[T_doc]): """ - Extend a DocList with an Iterable of Document. The Documents must be from - the same class as the doc_type of this DocList otherwise it will + Extend a `DocList` with an Iterable of Document. The Documents must be from + the same class as the `.doc_type` of this `DocList` otherwise it will fail. :param docs: Iterable of Documents """ @@ -195,8 +197,8 @@ def extend(self, docs: Iterable[T_doc]): def insert(self, i: int, doc: T_doc): """ - Insert a Document to the DocList. The Document must be from the same - class as the doc_type of this DocList otherwise it will fail. + Insert a Document to the `DocList`. 
The Document must be from the same + class as the doc_type of this `DocList` otherwise it will fail. :param i: index to insert :param doc: A Document """ @@ -238,10 +240,10 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocList using the passed values + """Set all Documents in this `DocList` using the passed values :param field: name of the fields to set - :values: the values to set at the DocList level + :values: the values to set at the `DocList` level """ ... @@ -253,11 +255,11 @@ def stack( tensor_type: Type['AbstractTensor'] = NdArray, ) -> 'DocVec': """ - Convert the DocList into a DocVec. `Self` cannot be used + Convert the `DocList` into a `DocVec`. `Self` cannot be used afterwards :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor - :return: A DocVec of the same document type as self + :return: A `DocVec` of the same document type as self """ from docarray.array.doc_vec.doc_vec import DocVec @@ -291,7 +293,7 @@ def traverse_flat( @classmethod def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocList + :param pb_msg: The protobuf message from where to construct the `DocList` """ return super().from_protobuf(pb_msg) diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index fed12363697..e0814e89fa8 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -120,7 +120,7 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: return cls(cls.doc_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs) def to_protobuf(self) -> 'DocListProto': - """Convert DocList into a Protobuf message""" + """Convert `DocList` into a Protobuf message""" from docarray.proto import DocListProto da_proto = 
DocListProto() @@ -137,13 +137,13 @@ def from_bytes( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize bytes into a DocList. + """Deserialize bytes into a `DocList`. :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocList + :return: the deserialized `DocList` """ return cls._load_binary_all( file_ctx=nullcontext(data), @@ -242,7 +242,7 @@ def to_bytes( file_ctx: Optional[BinaryIO] = None, show_progress: bool = False, ) -> Optional[bytes]: - """Serialize itself into bytes. + """Serialize itself into `bytes`. For more Pythonic code, please use ``bytes(...)``. @@ -273,13 +273,13 @@ def from_base64( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize base64 strings into a DocList. + """Deserialize base64 strings into a `DocList`. :param data: Base64 string to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocList + :return: the deserialized `DocList` """ return cls._load_binary_all( file_ctx=nullcontext(base64.b64decode(data)), @@ -315,17 +315,17 @@ def from_json( cls: Type[T], file: Union[str, bytes, bytearray], ) -> T: - """Deserialize JSON strings or bytes into a DocList. + """Deserialize JSON strings or bytes into a `DocList`. 
- :param file: JSON object from where to deserialize a DocList - :return: the deserialized DocList + :param file: JSON object from where to deserialize a `DocList` + :return: the deserialized `DocList` """ json_docs = orjson.loads(file) return cls([cls.doc_type(**v) for v in json_docs]) def to_json(self) -> bytes: - """Convert the object into JSON bytes. Can be loaded via :meth:`.from_json`. - :return: JSON serialization of DocList + """Convert the object into JSON bytes. Can be loaded via `.from_json`. + :return: JSON serialization of `DocList` """ return orjson_dumps(self._data) @@ -345,22 +345,25 @@ def from_csv( ) -> 'DocList': """ Load a DocList from a csv file following the schema defined in the - :attr:`~docarray.DocList.doc_type` attribute. + [`.doc_type`][docarray.DocList] attribute. Every row of the csv file will be mapped to one document in the doc_list. The column names (defined in the first row) have to match the field names of the Document type. - For nested fields use "__"-separated access paths, such as 'image__url'. + For nested fields use "__"-separated access paths, such as `'image__url'`. List-like fields (including field of type DocList) are not supported. :param file_path: path to csv file to load DocList from. :param encoding: encoding used to read the csv file. Defaults to 'utf-8'. :param dialect: defines separator and how to handle whitespaces etc. - Can be a csv.Dialect instance or one string of: - 'excel' (for comma seperated values), - 'excel-tab' (for tab separated values), - 'unix' (for csv file generated on UNIX systems). - :return: DocList + Can be a [`csv.Dialect`](https://docs.python.org/3/library/csv.html#csv.Dialect) + instance or one string of: + + - 'excel' (for comma separated values), + - 'excel-tab' (for tab separated values), + - 'unix' (for csv file generated on UNIX systems). 
+ + :return: `DocList` object """ if cls.doc_type == AnyDoc: raise TypeError( @@ -415,18 +418,20 @@ def to_csv( self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' ) -> None: """ - Save a DocList to a csv file. + Save a `DocList` to a csv file. The field names will be stored in the first row. Each row corresponds to the information of one Document. Columns for nested fields will be named after the "__"-seperated access paths, - such as `"image__url"` for `image.url`. + such as `'image__url'` for `image.url`. :param file_path: path to a csv file. :param dialect: defines separator and how to handle whitespaces etc. - Can be a csv.Dialect instance or one string of: - 'excel' (for comma seperated values), - 'excel-tab' (for tab separated values), - 'unix' (for csv file generated on UNIX systems). + Can be a [`csv.Dialect`](https://docs.python.org/3/library/csv.html#csv.Dialect) + instance or one string of: + + - 'excel' (for comma seperated values), + - 'excel-tab' (for tab separated values), + - 'unix' (for csv file generated on UNIX systems). """ fields = self.doc_type._get_access_paths() @@ -441,42 +446,43 @@ def to_csv( @classmethod def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList': """ - Load a DocList from a `pandas.DataFrame` following the schema - defined in the :attr:`~docarray.DocList.doc_type` attribute. + Load a `DocList` from a `pandas.DataFrame` following the schema + defined in the [`.doc_type`][docarray.DocList] attribute. Every row of the dataframe will be mapped to one Document in the doc_list. The column names of the dataframe have to match the field names of the Document type. For nested fields use "__"-separated access paths as column names, - such as 'image__url'. + such as `'image__url'`. List-like fields (including field of type DocList) are not supported. - EXAMPLE USAGE: + --- - .. 
code-block:: python + ```python + import pandas as pd - import pandas as pd + from docarray import BaseDoc, DocList - from docarray import BaseDoc, DocList + class Person(BaseDoc): + name: str + follower: int - class Person(BaseDoc): - name: str - follower: int + df = pd.DataFrame( + data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] + ) - df = pd.DataFrame( - data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] - ) - - docs = DocList[Person].from_dataframe(df) + docs = DocList[Person].from_dataframe(df) - assert docs.name == ['Maria', 'Jake'] - assert docs.follower == [12345, 54321] + assert docs.name == ['Maria', 'Jake'] + assert docs.follower == [12345, 54321] + ``` + --- - :param df: pandas.DataFrame to extract Document's information from - :return: DocList where each Document contains the information of one + :param df: `pandas.DataFrame` to extract Document's information from + :return: `DocList` where each Document contains the information of one corresponding row of the `pandas.DataFrame`. """ from docarray import DocList @@ -518,9 +524,9 @@ def to_dataframe(self) -> 'pd.DataFrame': The field names will be stored as column names. Each row of the dataframe corresponds to the information of one Document. Columns for nested fields will be named after the "__"-seperated access paths, - such as `"image__url"` for `image.url`. + such as `'image__url'` for `image.url`. - :return: pandas.DataFrame + :return: `pandas.DataFrame` """ if TYPE_CHECKING: import pandas as pd @@ -702,9 +708,9 @@ def load_binary( :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a DocList object + :return: a `DocList` object - .. note:: + !!! note If `file` is `str` it can specify `protocol` and `compress` as file extensions. 
This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. @@ -762,7 +768,7 @@ def save_binary( :param compress: compress algorithm to use :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - .. note:: + !!! note If `file` is `str` it can specify `protocol` and `compress` as file extensions. This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. diff --git a/docarray/documents/__init__.py b/docarray/documents/__init__.py index 04667963843..aba89edd172 100644 --- a/docarray/documents/__init__.py +++ b/docarray/documents/__init__.py @@ -1,8 +1,17 @@ from docarray.documents.audio import AudioDoc from docarray.documents.image import ImageDoc -from docarray.documents.mesh import Mesh3D -from docarray.documents.point_cloud import PointCloud3D +from docarray.documents.mesh import Mesh3D, VerticesAndFaces +from docarray.documents.point_cloud import PointCloud3D, PointsAndColors from docarray.documents.text import TextDoc from docarray.documents.video import VideoDoc -__all__ = ['TextDoc', 'ImageDoc', 'AudioDoc', 'Mesh3D', 'PointCloud3D', 'VideoDoc'] +__all__ = [ + 'TextDoc', + 'ImageDoc', + 'AudioDoc', + 'Mesh3D', + 'VerticesAndFaces', + 'PointCloud3D', + 'PointsAndColors', + 'VideoDoc', +] diff --git a/docarray/documents/audio.py b/docarray/documents/audio.py index 3103ab656c1..ee189b5c867 100644 --- a/docarray/documents/audio.py +++ b/docarray/documents/audio.py @@ -24,75 +24,74 @@ class AudioDoc(BaseDoc): """ Document for handling audios. - The Audio Document can contain an AudioUrl (`AudioDoc.url`), an AudioTensor - (`AudioDoc.tensor`), and an AnyEmbedding (`AudioDoc.embedding`). 
+ The Audio Document can contain: - EXAMPLE USAGE: + - an [`AudioUrl`][docarray.typing.url.AudioUrl] (`AudioDoc.url`) + - an [`AudioTensor`](../../../api_references/typing/tensor/audio) (`AudioDoc.tensor`) + - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`AudioDoc.embedding`) + - an [`AudioBytes`][docarray.typing.bytes.AudioBytes] (`AudioDoc.bytes_`) object + - an integer representing the frame_rate (`AudioDoc.frame_rate`) You can use this Document directly: - .. code-block:: python + ```python + from docarray.documents import AudioDoc - from docarray.documents import AudioDoc - - # use it directly - audio = Audio( - url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' - ) - audio.tensor, audio.frame_rate = audio.url.load() - model = MyEmbeddingModel() - audio.embedding = model(audio.tensor) + # use it directly + audio = AudioDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' + ) + audio.tensor, audio.frame_rate = audio.url.load() + # model = MyEmbeddingModel() + # audio.embedding = model(audio.tensor) + ``` You can extend this Document: - .. 
code-block:: python - - from docarray.documents import AudioDoc, TextDoc - from typing import Optional + ```python + from docarray.documents import AudioDoc, TextDoc + from typing import Optional - # extend it - class MyAudio(Audio): - name: Optional[Text] + # extend it + class MyAudio(AudioDoc): + name: Optional[TextDoc] - audio = MyAudio( - url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' - ) - audio.tensor, audio.frame_rate = audio.url.load() - model = MyEmbeddingModel() - audio.embedding = model(audio.tensor) - audio.name = Text(text='my first audio') - + audio = MyAudio( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' + ) + audio.name = TextDoc(text='my first audio') + audio.tensor, audio.frame_rate = audio.url.load() + # model = MyEmbeddingModel() + # audio.embedding = model(audio.tensor) + ``` You can use this Document for composition: - .. code-block:: python - - from docarray import BaseDoc - from docarray.documents import AudioDoc, TextDoc + ```python + from docarray import BaseDoc + from docarray.documents import AudioDoc, TextDoc - # compose it - class MultiModalDoc(Document): - audio: Audio - text: Text + # compose it + class MultiModalDoc(BaseDoc): + audio: AudioDoc + text: TextDoc - mmdoc = MultiModalDoc( - audio=Audio( - url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' - ), - text=Text(text='hello world, how are you doing?'), - ) - mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load() - - # equivalent to - - mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes() - - mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes.load() - + mmdoc = MultiModalDoc( + audio=AudioDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' + ), + text=TextDoc(text='hello world, how are you doing?'), + ) + mmdoc.audio.tensor, mmdoc.audio.frame_rate = 
mmdoc.audio.url.load() + + # equivalent to + mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes() + mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load() + ``` """ url: Optional[AudioUrl] diff --git a/docarray/documents/helper.py b/docarray/documents/helper.py index 71a56260864..94be2db2739 100644 --- a/docarray/documents/helper.py +++ b/docarray/documents/helper.py @@ -36,24 +36,21 @@ def create_doc( in the format `=(, )` or `=` :return: the new Document class - EXAMPLE USAGE - - .. code-block:: python - - from docarray.documents import Audio - from docarray.documents.helper import create_doc - from docarray.typing.tensor.audio import AudioNdArray - - MyAudio = create_doc( - 'MyAudio', - __base__=Audio, - title=(str, ...), - tensor=(AudioNdArray, ...), - ) - - assert issubclass(MyAudio, BaseDoc) - assert issubclass(MyAudio, Audio) + ```python + from docarray.documents import Audio + from docarray.documents.helper import create_doc + from docarray.typing.tensor.audio import AudioNdArray + + MyAudio = create_doc( + 'MyAudio', + __base__=Audio, + title=(str, ...), + tensor=(AudioNdArray, ...), + ) + assert issubclass(MyAudio, BaseDoc) + assert issubclass(MyAudio, Audio) + ``` """ if not issubclass(__base__, BaseDoc): diff --git a/docarray/documents/image.py b/docarray/documents/image.py index 47456c8cd29..12e32e3cb19 100644 --- a/docarray/documents/image.py +++ b/docarray/documents/image.py @@ -21,67 +21,75 @@ class ImageDoc(BaseDoc): """ Document for handling images. - It can contain an ImageUrl (`Image.url`), an AnyTensor (`Image.tensor`), - and an AnyEmbedding (`Image.embedding`). 
- EXAMPLE USAGE: + It can contain: - You can use this Document directly: + - an [`ImageUrl`][docarray.typing.url.ImageUrl] (`Image.url`) + - an [`ImageTensor`](../../../api_references/typing/tensor/image) (`Image.tensor`) + - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`Image.embedding`) + - an [`ImageBytes`][docarray.typing.bytes.ImageBytes] object (`ImageDoc.bytes_`) - .. code-block:: python + You can use this Document directly: - from docarray.documents import ImageDoc + ```python + from docarray.documents import ImageDoc - # use it directly - image = ImageDoc(url='http://www.jina.ai/image.jpg') - image.tensor = image.url.load() - model = MyEmbeddingModel() - image.embedding = model(image.tensor) + # use it directly + image = ImageDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true' + ) + image.tensor = image.url.load() + # model = MyEmbeddingModel() + # image.embedding = model(image.tensor) + ``` You can extend this Document: - .. 
code-block:: python - - from docarray.documents import ImageDoc - from docarray.typing import AnyEmbedding - from typing import Optional + ```python + from docarray.documents import ImageDoc + from docarray.typing import AnyEmbedding + from typing import Optional - # extend it - class MyImage(ImageDoc): - second_embedding: Optional[AnyEmbedding] + # extend it + class MyImage(ImageDoc): + second_embedding: Optional[AnyEmbedding] - image = MyImage(url='http://www.jina.ai/image.jpg') - image.tensor = image.url.load() - model = MyEmbeddingModel() - image.embedding = model(image.tensor) - image.second_embedding = model(image.tensor) - + image = MyImage( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true' + ) + image.tensor = image.url.load() + # model = MyEmbeddingModel() + # image.embedding = model(image.tensor) + # image.second_embedding = model(image.tensor) + ``` You can use this Document for composition: - .. code-block:: python - - from docarray import BaseDoc - from docarray.documents import ImageDoc, TextDoc + ```python + from docarray import BaseDoc + from docarray.documents import ImageDoc, TextDoc - # compose it - class MultiModalDoc(BaseDoc): - image: Image - text: Text + # compose it + class MultiModalDoc(BaseDoc): + image: ImageDoc + text: TextDoc - mmdoc = MultiModalDoc( - image=Image(url="http://www.jina.ai/image.jpg"), - text=Text(text="hello world, how are you doing?"), - ) - mmdoc.image.tensor = mmdoc.image.url.load() - # or - mmdoc.image.bytes_ = mmdoc.image.url.load_bytes() + mmdoc = MultiModalDoc( + image=ImageDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true' + ), + text=TextDoc(text='hello world, how are you doing?'), + ) + mmdoc.image.tensor = mmdoc.image.url.load() - mmdoc.image.tensor = mmdoc.image.bytes.load() + # or + mmdoc.image.bytes_ = mmdoc.image.url.load_bytes() + mmdoc.image.tensor = mmdoc.image.bytes_.load() + ``` 
""" url: Optional[ImageUrl] diff --git a/docarray/documents/legacy/legacy_document.py b/docarray/documents/legacy/legacy_document.py index 96e2ee1e758..3d9cde62d13 100644 --- a/docarray/documents/legacy/legacy_document.py +++ b/docarray/documents/legacy/legacy_document.py @@ -14,22 +14,23 @@ class LegacyDocument(BaseDoc): Nevertheless, the API is not totally compatible with DocAray v1 `Document`. Indeed, none of the method associated with `Document` are present. Only the schema of the data is similar. - .. code-block:: python - from docarray import DocList - from docarray.documents.legacy import LegacyDocument - import numpy as np + ```python + from docarray import DocList + from docarray.documents.legacy import LegacyDocument + import numpy as np - doc = LegacyDocument(text='hello') - doc.url = 'http://myimg.png' - doc.tensor = np.zeros((3, 224, 224)) - doc.embedding = np.zeros((100, 1)) + doc = LegacyDocument(text='hello') + doc.url = 'http://myimg.png' + doc.tensor = np.zeros((3, 224, 224)) + doc.embedding = np.zeros((100, 1)) - doc.tags['price'] = 10 + doc.tags['price'] = 10 - doc.chunks = DocList[Document]([Document() for _ in range(10)]) + doc.chunks = DocList[Document]([Document() for _ in range(10)]) - doc.chunks = DocList[Document]([Document() for _ in range(10)]) + doc.chunks = DocList[Document]([Document() for _ in range(10)]) + ``` """ diff --git a/docarray/documents/mesh/__init__.py b/docarray/documents/mesh/__init__.py index e1f402ac56f..15ba1fdab10 100644 --- a/docarray/documents/mesh/__init__.py +++ b/docarray/documents/mesh/__init__.py @@ -1,3 +1,4 @@ from docarray.documents.mesh.mesh_3d import Mesh3D +from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces -__all__ = ['Mesh3D'] +__all__ = ['Mesh3D', 'VerticesAndFaces'] diff --git a/docarray/documents/mesh/mesh_3d.py b/docarray/documents/mesh/mesh_3d.py index a8a2edd5d15..82d93f73456 100644 --- a/docarray/documents/mesh/mesh_3d.py +++ b/docarray/documents/mesh/mesh_3d.py @@ -19,85 
+19,87 @@ class Mesh3D(BaseDoc): tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a vertex in the tensor of vertices. - The Mesh3D Document can contain an Mesh3DUrl (`Mesh3D.url`), a VerticesAndFaces - object containing an AnyTensor of vertices (`Mesh3D.tensors.vertices) and an - AnyTensor of faces (`Mesh3D.tensors.faces), and an AnyEmbedding - (`Mesh3D.embedding`). + The Mesh3D Document can contain: - EXAMPLE USAGE: + - an [`Mesh3DUrl`][docarray.typing.url.Mesh3DUrl] (`Mesh3D.url`) + - a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] + object containing: - You can use this Document directly: + - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of + vertices (`Mesh3D.tensors.vertices`) + - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of faces (`Mesh3D.tensors.faces`) - .. code-block:: python + - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`Mesh3D.embedding`) + - a `bytes` object (`Mesh3D.bytes_`). - from docarray.documents import Mesh3D + You can use this Document directly: - # use it directly - mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') - mesh.tensors = mesh.url.load() - model = MyEmbeddingModel() - mesh.embedding = model(mesh.tensors.vertices) + ```python + from docarray.documents import Mesh3D - You can extend this Document: + # use it directly + mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') + mesh.tensors = mesh.url.load() + # model = MyEmbeddingModel() + # mesh.embedding = model(mesh.tensors.vertices) + ``` - .. 
code-block:: python - - from docarray.documents import Mesh3D - from docarray.typing import AnyEmbedding - from typing import Optional + You can extend this Document: + ```python + from docarray.documents import Mesh3D + from docarray.typing import AnyEmbedding + from typing import Optional - # extend it - class MyMesh3D(Mesh3D): - name: Optional[Text] + # extend it + class MyMesh3D(Mesh3D): + name: Optional[str] - mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') - mesh.tensors = mesh.url.load() - model = MyEmbeddingModel() - mesh.embedding = model(mesh.vertices) - mesh.name = 'my first mesh' + mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') + mesh.name = 'my first mesh' + mesh.tensors = mesh.url.load() + # model = MyEmbeddingModel() + # mesh.embedding = model(mesh.vertices) + ``` You can use this Document for composition: - .. code-block:: python + ```python + from docarray import BaseDoc + from docarray.documents import Mesh3D, TextDoc - from docarray import BaseDoc - from docarray.documents import Mesh3D, Text + # compose it + class MultiModalDoc(BaseDoc): + mesh: Mesh3D + text: TextDoc - # compose it - class MultiModalDoc(BaseDoc): - mesh: Mesh3D - text: Text + mmdoc = MultiModalDoc( + mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'), + text=TextDoc(text='hello world, how are you doing?'), + ) + mmdoc.mesh.tensors = mmdoc.mesh.url.load() - mmdoc = MultiModalDoc( - mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'), - text=Text(text='hello world, how are you doing?'), - ) - mmdoc.mesh.tensors = mmdoc.mesh.url.load() - - # or - mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes() - + # or + mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes() + ``` You can display your 3D mesh in a notebook from either its url, or its tensors: - .. 
code-block:: python - - from docarray.documents import Mesh3D - - # display from url - mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') - mesh.url.display() + ```python + from docarray.documents import Mesh3D - # display from tensors - mesh.tensors = mesh.url.load() - model = MyEmbeddingModel() - mesh.embedding = model(mesh.tensors.vertices) + # display from url + mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') + # mesh.url.display() + # display from tensors + mesh.tensors = mesh.url.load() + # mesh.tensors.display() + ``` """ diff --git a/docarray/documents/mesh/vertices_and_faces.py b/docarray/documents/mesh/vertices_and_faces.py index cc3dc6da795..758f0acc6b0 100644 --- a/docarray/documents/mesh/vertices_and_faces.py +++ b/docarray/documents/mesh/vertices_and_faces.py @@ -9,11 +9,14 @@ class VerticesAndFaces(BaseDoc): """ - Document for handling 3D mesh tensor data. + Document for handling the tensor data of a [`Mesh3D`][docarray.documents.mesh.Mesh3D] object. - A VerticesAndFaces Document can contain an AnyTensor containing the vertices - information (`VerticesAndFaces.vertices`), and an AnyTensor containing the faces - information (`VerticesAndFaces.faces`). 
+ A VerticesAndFaces Document can contain: + + - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) + containing the vertices information (`VerticesAndFaces.vertices`) + - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) + containing the faces information (`VerticesAndFaces.faces`) """ vertices: AnyTensor diff --git a/docarray/documents/point_cloud/__init__.py b/docarray/documents/point_cloud/__init__.py index 65b966114b5..27a9defeb87 100644 --- a/docarray/documents/point_cloud/__init__.py +++ b/docarray/documents/point_cloud/__init__.py @@ -1,3 +1,4 @@ from docarray.documents.point_cloud.point_cloud_3d import PointCloud3D +from docarray.documents.point_cloud.points_and_colors import PointsAndColors -__all__ = ['PointCloud3D'] +__all__ = ['PointCloud3D', 'PointsAndColors'] diff --git a/docarray/documents/point_cloud/point_cloud_3d.py b/docarray/documents/point_cloud/point_cloud_3d.py index 36e56a4c571..8a1963be69f 100644 --- a/docarray/documents/point_cloud/point_cloud_3d.py +++ b/docarray/documents/point_cloud/point_cloud_3d.py @@ -25,90 +25,86 @@ class PointCloud3D(BaseDoc): Point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly sampling points within the surface of the 3D body. Compared to the mesh - representation, the point cloud is a fixed size ndarray (shape=(n_samples, 3)) and + representation, the point cloud is a fixed size ndarray of shape `(n_samples, 3)` and hence easier for deep learning algorithms to handle. - A PointCloud3D Document can contain an PointCloud3DUrl (`PointCloud3D.url`), - a PointsAndColors object (`PointCloud3D.tensors`), and an AnyEmbedding - (`PointCloud3D.embedding`). 
+ A PointCloud3D Document can contain: - EXAMPLE USAGE: + - a [`PointCloud3DUrl`][docarray.typing.url.PointCloud3DUrl] (`PointCloud3D.url`) + - a [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] object (`PointCloud3D.tensors`) + - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`PointCloud3D.embedding`) + - a `bytes` object (`PointCloud3D.bytes_`) You can use this Document directly: - .. code-block:: python + ```python + from docarray.documents import PointCloud3D - from docarray.documents import PointCloud3D - - # use it directly - pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') - pc.tensors = pc.url.load(samples=100) - model = MyEmbeddingModel() - pc.embedding = model(pc.tensors.points) + # use it directly + pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') + pc.tensors = pc.url.load(samples=100) + # model = MyEmbeddingModel() + # pc.embedding = model(pc.tensors.points) + ``` You can extend this Document: - .. 
code-block:: python - - from docarray.documents import PointCloud3D - from docarray.typing import AnyEmbedding - from typing import Optional + ```python + from docarray.documents import PointCloud3D + from docarray.typing import AnyEmbedding + from typing import Optional - # extend it - class MyPointCloud3D(PointCloud3D): - second_embedding: Optional[AnyEmbedding] + # extend it + class MyPointCloud3D(PointCloud3D): + second_embedding: Optional[AnyEmbedding] - pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') - pc.tensors = pc.url.load(samples=100) - model = MyEmbeddingModel() - pc.embedding = model(pc.tensors.points) - pc.second_embedding = model(pc.tensors.colors) - + pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') + pc.tensors = pc.url.load(samples=100) + # model = MyEmbeddingModel() + # pc.embedding = model(pc.tensors.points) + # pc.second_embedding = model(pc.tensors.colors) + ``` You can use this Document for composition: - .. 
code-block:: python - - from docarray import BaseDoc - from docarray.documents import PointCloud3D, Text + ```python + from docarray import BaseDoc + from docarray.documents import PointCloud3D, TextDoc - # compose it - class MultiModalDoc(BaseDoc): - point_cloud: PointCloud3D - text: Text + # compose it + class MultiModalDoc(BaseDoc): + point_cloud: PointCloud3D + text: TextDoc - mmdoc = MultiModalDoc( - point_cloud=PointCloud3D( - url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj' - ), - text=Text(text='hello world, how are you doing?'), - ) - mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100) - - # or - - mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes() + mmdoc = MultiModalDoc( + point_cloud=PointCloud3D( + url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj' + ), + text=TextDoc(text='hello world, how are you doing?'), + ) + mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100) + # or + mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes() + ``` You can display your point cloud from either its url, or its tensors: - .. 
code-block:: python - - from docarray.documents import PointCloud3D - - # display from url - pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') - pc.url.display() + ```python + from docarray.documents import PointCloud3D - # display from tensors - pc.tensors = pc.url.load(samples=10000) - model = MyEmbeddingModel() - pc.embedding = model(pc.tensors.points) + # display from url + pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj') + # pc.url.display() + # display from tensors + pc.tensors = pc.url.load(samples=10000) + # pc.tensors.display() + ``` """ url: Optional[PointCloud3DUrl] diff --git a/docarray/documents/point_cloud/points_and_colors.py b/docarray/documents/point_cloud/points_and_colors.py index 825744bda4f..89475d3d9cd 100644 --- a/docarray/documents/point_cloud/points_and_colors.py +++ b/docarray/documents/point_cloud/points_and_colors.py @@ -20,11 +20,14 @@ class PointsAndColors(BaseDoc): """ - Document for handling point clouds tensor data. + Document for handling the tensor data of a [`PointCloud3D`][docarray.documents.point_cloud.PointCloud3D] object. - A PointsAndColors Document can contain an AnyTensor containing the points in - 3D space information (`PointsAndColors.points`), and an AnyTensor containing - the points' color information (`PointsAndColors.colors`). + A PointsAndColors Document can contain: + + - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) + containing the points in 3D space information (`PointsAndColors.points`) + - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) + containing the points' color information (`PointsAndColors.colors`) """ points: AnyTensor diff --git a/docarray/documents/text.py b/docarray/documents/text.py index c0c2725891c..91d2be7045a 100644 --- a/docarray/documents/text.py +++ b/docarray/documents/text.py @@ -10,92 +10,96 @@ class TextDoc(BaseDoc): """ Document for handling text. 
- It can contain a TextUrl (`TextDoc.url`), a str (`TextDoc.text`), - and an AnyEmbedding (`TextDoc.embedding`). - EXAMPLE USAGE: + It can contain: - You can use this Document directly: + - a [`TextUrl`][docarray.typing.url.TextUrl] (`TextDoc.url`) + - a `str` (`TextDoc.text`) + - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`TextDoc.embedding`) + - a `bytes` object (`TextDoc.bytes_`) - .. code-block:: python + You can use this Document directly: - from docarray.documents import TextDoc + ```python + from docarray.documents import TextDoc - # use it directly - txt_doc = Text(url='http://www.jina.ai/') - txt_doc.text = txt_doc.url.load() - model = MyEmbeddingModel() - txt_doc.embedding = model(txt_doc.text) + # use it directly + txt_doc = TextDoc(url='http://www.jina.ai/') + txt_doc.text = txt_doc.url.load() + # model = MyEmbeddingModel() + # txt_doc.embedding = model(txt_doc.text) + ``` You can initialize directly from a string: - .. code-block:: python - - from docarray.documents import TextDoc + ```python + from docarray.documents import TextDoc - txt_doc = Text('hello world') + txt_doc = TextDoc('hello world') + ``` You can extend this Document: - .. 
code-block:: python - - from docarray.documents import TextDoc - from docarray.typing import AnyEmbedding - from typing import Optional + ```python + from docarray.documents import TextDoc + from docarray.typing import AnyEmbedding + from typing import Optional - # extend it - class MyText(Text): - second_embedding: Optional[AnyEmbedding] + # extend it + class MyText(TextDoc): + second_embedding: Optional[AnyEmbedding] - txt_doc = MyText(url='http://www.jina.ai/') - txt_doc.text = txt_doc.url.load() - model = MyEmbeddingModel() - txt_doc.embedding = model(txt_doc.text) - txt_doc.second_embedding = model(txt_doc.text) - + txt_doc = MyText(url='http://www.jina.ai/') + txt_doc.text = txt_doc.url.load() + # model = MyEmbeddingModel() + # txt_doc.embedding = model(txt_doc.text) + # txt_doc.second_embedding = model(txt_doc.text) + ``` You can use this Document for composition: - .. code-block:: python - - from docarray import BaseDoc - from docarray.documents import ImageDoc, TextDoc + ```python + from docarray import BaseDoc + from docarray.documents import ImageDoc, TextDoc - # compose it - class MultiModalDoc(BaseDoc): - image_doc: Image - text_doc: Text + # compose it + class MultiModalDoc(BaseDoc): + image_doc: ImageDoc + text_doc: TextDoc - mmdoc = MultiModalDoc( - image_doc=Image(url="http://www.jina.ai/image.jpg"), - text_doc=Text(text="hello world, how are you doing?"), - ) - mmdoc.text_doc.text = mmdoc.text_doc.url.load() - - # or - - mmdoc.text_doc.bytes_ = mmdoc.text_doc.url.load_bytes() + mmdoc = MultiModalDoc( + image_doc=ImageDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true' + ), + text_doc=TextDoc(text='hello world, how are you doing?'), + ) + mmdoc.image_doc.tensor = mmdoc.image_doc.url.load() + # or + mmdoc.image_doc.bytes_ = mmdoc.image_doc.url.load_bytes() + mmdoc.image_doc.tensor = mmdoc.image_doc.bytes_.load() + ``` This Document can be compared against another Document of the same 
type or a string. When compared against another object of the same type, the pydantic BaseModel equality check will apply which checks the equality of every attribute, - including `id`. When compared against a str, it will check the equality + excluding `id`. When compared against a str, it will check the equality of the `text` attribute against the given string. - .. code-block:: python + ```python + from docarray.documents import TextDoc - from docarray.documents import TextDoc + doc = TextDoc(text='This is the main text', url='exampleurl.com') + doc2 = TextDoc(text='This is the main text', url='exampleurl.com') - doc = Text(text='This is the main text', url='exampleurl.com') - doc2 = Text(text='This is the main text', url='exampleurl.com') + doc == 'This is the main text' # True + doc == doc2 # True + ``` - doc == 'This is the main text' # True - doc == doc2 # False, their ids are not equivalent """ text: Optional[str] @@ -126,18 +130,18 @@ def __eq__(self, other: Any) -> bool: def __contains__(self, item: str) -> bool: """ - This method makes `Text` behave the same as an `str`. - - .. code-block:: python - - from docarray.documents import Text - - t = Text(text='this is my text document') - assert 'text' in t - assert 'docarray' not in t + This method makes `TextDoc` behave the same as an `str`. 
:param item: A string to be checked if is a substring of `text` attribute :return: A boolean determining the presence of `item` as a substring in `text` + + ```python + from docarray.documents import TextDoc + + t = TextDoc(text='this is my text document') + assert 'text' in t + assert 'docarray' not in t + ``` """ if self.text is not None: return self.text.__contains__(item) diff --git a/docarray/documents/video.py b/docarray/documents/video.py index 1035beac277..9ead5cc4ffc 100644 --- a/docarray/documents/video.py +++ b/docarray/documents/video.py @@ -4,7 +4,7 @@ from docarray.base_doc import BaseDoc from docarray.documents import AudioDoc -from docarray.typing import AnyEmbedding, AnyTensor +from docarray.typing import AnyEmbedding, AnyTensor, VideoBytes from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.video.video_tensor import VideoTensor from docarray.typing.url.video_url import VideoUrl @@ -24,75 +24,77 @@ class VideoDoc(BaseDoc): """ Document for handling video. - The Video Document can contain a VideoUrl (`VideoDoc.url`), an Audio Document - (`VideoDoc.audio`), a VideoTensor (`VideoDoc.tensor`), an AnyTensor representing - the indices of the video's key frames (`VideoDoc.key_frame_indices`) and an - AnyEmbedding (`VideoDoc.embedding`). - EXAMPLE USAGE: + The Video Document can contain: - You can use this Document directly: + - a [`VideoUrl`][docarray.typing.url.VideoUrl] (`VideoDoc.url`) + - an [`AudioDoc`][docarray.documents.AudioDoc] (`VideoDoc.audio`) + - a [`VideoTensor`](../../../api_references/typing/tensor/video) (`VideoDoc.tensor`) + - an [`AnyTensor`](../../../api_references/typing/tensor/tensor) representing the indices of the video's key frames (`VideoDoc.key_frame_indices`) + - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`VideoDoc.embedding`) + - a [`VideoBytes`][docarray.typing.bytes.VideoBytes] object (`VideoDoc.bytes_`) - .. 
code-block:: python + You can use this Document directly: - from docarray.documents import Video + ```python + from docarray.documents import VideoDoc - # use it directly - vid = Video( - url='https://github.com/docarray/docarray/tree/feat-add-video-v2/tests/toydata/mov_bbb.mp4?raw=true' - ) - vid.audio.tensor, vid.tensor, vid.key_frame_indices = vid.url.load() - model = MyEmbeddingModel() - vid.embedding = model(vid.tensor) + # use it directly + vid = VideoDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' + ) + vid.tensor, vid.audio.tensor, vid.key_frame_indices = vid.url.load() + # model = MyEmbeddingModel() + # vid.embedding = model(vid.tensor) + ``` You can extend this Document: - .. code-block:: python + ```python + from typing import Optional - from typing import Optional + from docarray.documents import TextDoc, VideoDoc - from docarray.documents import TextDoc, VideoDoc + # extend it + class MyVideo(VideoDoc): + name: Optional[TextDoc] - # extend it - class MyVideo(Video): - name: Optional[Text] - - video = MyVideo( - url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' - ) - video.video_tensor = video.url.load().video - model = MyEmbeddingModel() - video.embedding = model(video.tensor) - video.name = Text(text='my first video') + video = MyVideo( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' + ) + video.name = TextDoc(text='my first video') + video.tensor = video.url.load().video + # model = MyEmbeddingModel() + # video.embedding = model(video.tensor) + ``` You can use this Document for composition: - .. 
code-block:: python - - from docarray import BaseDoc - from docarray.documents import TextDoc, VideoDoc + ```python + from docarray import BaseDoc + from docarray.documents import TextDoc, VideoDoc - # compose it - class MultiModalDoc(BaseDoc): - video: Video - text: Text + # compose it + class MultiModalDoc(BaseDoc): + video: VideoDoc + text: TextDoc - mmdoc = MultiModalDoc( - video=Video( - url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' - ), - text=Text(text='hello world, how are you doing?'), - ) - mmdoc.video.video_tensor = mmdoc.video.url.load().video - - # or - - mmdoc.video.bytes_ = mmdoc.video.url.load_bytes() - + mmdoc = MultiModalDoc( + video=VideoDoc( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' + ), + text=TextDoc(text='hello world, how are you doing?'), + ) + mmdoc.video.tensor = mmdoc.video.url.load().video + + # or + mmdoc.video.bytes_ = mmdoc.video.url.load_bytes() + mmdoc.video.tensor = mmdoc.video.bytes_.load().video + ``` """ url: Optional[VideoUrl] @@ -100,7 +102,7 @@ class MultiModalDoc(BaseDoc): tensor: Optional[VideoTensor] key_frame_indices: Optional[AnyTensor] embedding: Optional[AnyEmbedding] - bytes_: Optional[bytes] + bytes_: Optional[VideoBytes] @classmethod def validate( diff --git a/docarray/typing/__init__.py b/docarray/typing/__init__.py index 376b3aff94c..5fdb578ad04 100644 --- a/docarray/typing/__init__.py +++ b/docarray/typing/__init__.py @@ -3,11 +3,11 @@ from docarray.typing.bytes import AudioBytes, ImageBytes, VideoBytes from docarray.typing.id import ID from docarray.typing.tensor import ImageNdArray, ImageTensor -from docarray.typing.tensor.audio import AudioNdArray +from docarray.typing.tensor.audio import AudioNdArray, AudioTensor from docarray.typing.tensor.embedding.embedding import AnyEmbedding, NdArrayEmbedding from docarray.typing.tensor.ndarray import NdArray from docarray.typing.tensor.tensor import AnyTensor 
-from docarray.typing.tensor.video import VideoNdArray +from docarray.typing.tensor.video import VideoNdArray, VideoTensor from docarray.typing.url import ( AnyUrl, AudioUrl, @@ -50,6 +50,8 @@ 'ID', 'AnyTensor', 'ImageTensor', + 'AudioTensor', + 'VideoTensor', 'ImageNdArray', 'ImageBytes', 'VideoBytes', diff --git a/docarray/typing/bytes/video_bytes.py b/docarray/typing/bytes/video_bytes.py index 7a870727c1c..008eeb8b0fd 100644 --- a/docarray/typing/bytes/video_bytes.py +++ b/docarray/typing/bytes/video_bytes.py @@ -1,5 +1,5 @@ from io import BytesIO -from typing import TYPE_CHECKING, Any, NamedTuple, Type, TypeVar +from typing import TYPE_CHECKING, Any, List, NamedTuple, Type, TypeVar import numpy as np from pydantic import parse_obj_as @@ -91,20 +91,20 @@ class MyDoc(BaseDoc): av = import_library('av') with av.open(BytesIO(self), **kwargs) as container: - audio_frames = [] - video_frames = [] - keyframe_indices = [] + audio_frames: List[np.ndarray] = [] + video_frames: List[np.ndarray] = [] + keyframe_indices: List[int] = [] for frame in container.decode(): if type(frame) == av.audio.frame.AudioFrame: audio_frames.append(frame.to_ndarray()) elif type(frame) == av.video.frame.VideoFrame: - video_frames.append(frame.to_ndarray(format='rgb24')) - if frame.key_frame == 1: curr_index = len(video_frames) keyframe_indices.append(curr_index) + video_frames.append(frame.to_ndarray(format='rgb24')) + if len(audio_frames) == 0: audio = parse_obj_as(AudioNdArray, np.array(audio_frames)) else: diff --git a/docarray/typing/tensor/audio/abstract_audio_tensor.py b/docarray/typing/tensor/audio/abstract_audio_tensor.py index 0d1636dc164..56fdae6c05e 100644 --- a/docarray/typing/tensor/audio/abstract_audio_tensor.py +++ b/docarray/typing/tensor/audio/abstract_audio_tensor.py @@ -42,15 +42,13 @@ def save( :param sample_width: sample width in bytes :param pydub_args: dictionary of additional arguments for pydub.AudioSegment.export function """ - if TYPE_CHECKING: - import pydub - 
else: - pydub = import_library('pydub', raise_error=True) + pydub = import_library('pydub', raise_error=True) # noqa: F841 + from pydub import AudioSegment comp_backend = self.get_comp_backend() channels = 2 if comp_backend.n_dim(array=self) > 1 else 1 # type: ignore - segment = pydub.AudioSegment( + segment = AudioSegment( self.to_bytes(), frame_rate=frame_rate, sample_width=sample_width, diff --git a/docarray/typing/url/audio_url.py b/docarray/typing/url/audio_url.py index 6851b8a98c8..a84a68754ee 100644 --- a/docarray/typing/url/audio_url.py +++ b/docarray/typing/url/audio_url.py @@ -75,4 +75,4 @@ def display(self): else: display(Audio(filename=self)) else: - warnings.warn('Display of image is only possible in a notebook.') + warnings.warn('Display of audio is only possible in a notebook.') diff --git a/docs/api_references/array/any_da.md b/docs/api_references/array/any_da.md index e71d1999cf5..2f9c0d61409 100644 --- a/docs/api_references/array/any_da.md +++ b/docs/api_references/array/any_da.md @@ -1,3 +1,3 @@ # AnyDocArray -::: docarray.array.doc_vec.doc_vec.DocVec +::: docarray.array.any_array.AnyDocArray diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index 21a206a9537..eedcec827cd 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,3 +1,4 @@ # DocList ::: docarray.array.doc_list.doc_list.DocList +::: docarray.array.doc_list.io.IOMixinArray diff --git a/docs/api_references/array/da_stack.md b/docs/api_references/array/da_stack.md index 74f9ff637a0..917b4488d78 100644 --- a/docs/api_references/array/da_stack.md +++ b/docs/api_references/array/da_stack.md @@ -1,3 +1,3 @@ # DocVec -::: docarray.array.any_array.AnyDocArray +::: docarray.array.doc_vec.doc_vec.DocVec \ No newline at end of file diff --git a/docs/data_types/3d_mesh/3d_mesh.md b/docs/data_types/3d_mesh/3d_mesh.md new file mode 100644 index 00000000000..20db151bd23 --- /dev/null +++ b/docs/data_types/3d_mesh/3d_mesh.md @@ -0,0 
+1,2698 @@
+# ๐Ÿงฌ 3D Mesh
+
+DocArray supports many different modalities including `3D Mesh`.
+This section will show you how to load and handle 3D data using DocArray.
+
+A 3D mesh is the structural build of a 3D model consisting of polygons. Most 3D meshes are created via professional software packages, such as commercial suites like Unity, or the free open-source Blender 3D.
+
+
+!!! note
+    This feature requires `trimesh`. You can install all necessary dependencies via:
+    ```cmd
+    pip install "docarray[mesh]"
+    ```
+
+## Vertices and Faces representation
+
+A 3D mesh can be represented by its vertices and faces:
+
+- **Vertices** are points in a 3D space, represented as a tensor of shape `(n_points, 3)`.
+- **Faces** are triangular surfaces that are defined by three points in 3D space, corresponding to the three vertices of a triangle. They can be represented as a tensor of shape `(n_faces, 3)`. Each number in that tensor refers to an index of a vertex in the tensor of vertices.
+
+### Load vertices and faces
+
+First, let's define our class `MyMesh3D`, which extends [`BaseDoc`][docarray.base_doc.doc.BaseDoc] and provides attributes to store our 3D data. It has a `mesh_url` attribute of type [`Mesh3DUrl`][docarray.typing.url.url_3d.mesh_url.Mesh3DUrl]. To store the vertices and faces, DocArray provides the [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] class, which has a `vertices` attribute and a `faces` attribute, both of type [`AnyTensor`](../../../../api_references/typing/tensor/tensor). This especially comes in handy later when we want to display our 3D mesh.
+
+!!! tip
+    Check out our predefined [`Mesh3D`](#getting-started-predefined-docs) to get started and play around with our 3D features.
+
+But for now, let's create a `MyMesh3D` instance with a URL to a remote `.obj` file:
+
+
+```python
+from typing import Optional
+
+from docarray import BaseDoc
+from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces
+from docarray.typing import Mesh3DUrl
+
+
+class MyMesh3D(BaseDoc):
+    mesh_url: Mesh3DUrl
+    tensors: Optional[VerticesAndFaces]
+
+
+doc = MyMesh3D(mesh_url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj")
+```
+
+To load the vertices and faces information, you can simply call [`.load()`][docarray.typing.url.url_3d.mesh_url.Mesh3DUrl.load] on the [`Mesh3DUrl`][docarray.typing.url.url_3d.mesh_url.Mesh3DUrl] instance. This will return a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] object.
+
+```python
+doc.tensors = doc.mesh_url.load()
+doc.summary()
+```
+
+<details>
+ Output + ``` { .text .no-copy } + ๐Ÿ“„ MyMesh3D : 9d8c26f ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ mesh_url: Mesh3DUrl โ”‚ https://people.sc.fsu.edu/~jburkardt/data/obj/al.o ... โ”‚ + โ”‚ โ”‚ (length: 52) โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ””โ”€โ”€ ๐Ÿ”ถ tensors: VerticesAndFaces + โ””โ”€โ”€ ๐Ÿ“„ VerticesAndFaces : 8cae4c4 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ vertices: NdArray โ”‚ NdArray of shape (3980, 3), dtype: float64 โ”‚ + โ”‚ faces: NdArray โ”‚ NdArray of shape (7152, 3), dtype: int64 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
+ +### Display 3D mesh in notebook + +You can display your 3D mesh interactively from its URL as well as a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] instance, by calling `.display()` on either one. The latter will always display without color, whereas the display from the URL will show with color if this information is included in the file content. + +``` { .python } +doc.mesh_url.display() +``` + + + + +## Point cloud representation + +A point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly sampling points within the surface of the 3D body. Compared to the mesh representation, the point cloud is a fixed size ndarray and hence easier for deep learning algorithms to handle. + +### Load point cloud + +!!! tip + Check out our predefined [`PointCloud3D`](#getting-started-predefined-docs) to get started and play around with our 3D features. + +In DocArray, loading a point cloud from a [`PointCloud3DUrl`][docarray.typing.url.url_3d.point_cloud_url.PointCloud3DUrl] instance will return a [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] instance. Such an object has a `points` attribute containing the information about the points in 3D space as well as an optional `colors` attribute. + +First, let's define our class `MyPointCloud`, which extends [`BaseDoc`][docarray.base_doc.doc.BaseDoc] and provides attributes to store the point cloud information. 
+ +```python +from typing import Optional + +from docarray import BaseDoc +from docarray.documents.point_cloud.points_and_colors import PointsAndColors +from docarray.typing import PointCloud3DUrl + + +class MyPointCloud(BaseDoc): + url: PointCloud3DUrl + tensors: Optional[PointsAndColors] + + +doc = MyPointCloud(url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") +``` + +Next, we can load a point cloud of size `samples` by simply calling [`.load()`][docarray.typing.url.url_3d.point_cloud_url.PointCloud3DUrl.load] on the [`PointCloud3DUrl`][docarray.typing.url.url_3d.point_cloud_url.PointCloud3DUrl] instance: + +```python +doc.tensors = doc.url.load(samples=1000) +doc.summary() +``` +
+ Output + ``` { .text .no-copy } + ๐Ÿ“„ MyPointCloud : a63374d ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ url: PointCloud3DUrl โ”‚ https://people.sc.fsu.edu/~jburkardt/datโ€ฆ โ”‚ + โ”‚ โ”‚ ... (length: 52) โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ””โ”€โ”€ ๐Ÿ”ถ tensors: PointsAndColors + โ””โ”€โ”€ ๐Ÿ“„ PointsAndColors : 70ae175 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ points: NdArray โ”‚ NdArray of shape (1000, 3), dtype: float64 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + + ``` +
+ + +### Display 3D point cloud in notebook +You can display your point cloud and interact with it from its URL as well as from a PointsAndColors instance. The first will always display without color, whereas the display from [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] will show with color if `.colors` is not None. + +``` { .python} +doc.url.display() +``` + + + + + + + +## Getting started - Predefined Docs +To get started and play around with the 3D modalities, DocArray provides the predefined documents [`Mesh3D`][docarray.documents.mesh.Mesh3D] and [`PointCloud3D`][docarray.documents.point_cloud.PointCloud3D], which includes all of the previously mentioned functionalities. + +### `Mesh3D` +The [`Mesh3D`][docarray.documents.mesh.Mesh3D] class for instance provides a [`Mesh3DUrl`][docarray.typing.Mesh3DUrl] field as well as a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces] field. + + +``` { .python } +class Mesh3D(BaseDoc): + url: Optional[Mesh3DUrl] + tensors: Optional[VerticesAndFaces] + embedding: Optional[AnyEmbedding] + bytes_: Optional[bytes] +``` + +### `PointCloud3D` + +``` { .python } +class PointCloud3D(BaseDoc): + url: Optional[PointCloud3DUrl] + tensors: Optional[PointsAndColors] + embedding: Optional[AnyEmbedding] + bytes_: Optional[bytes] +``` + +You can use them directly, extend or compose them. 
+ +```python +from docarray import BaseDoc +from docarray.documents import Mesh3D, PointCloud3D + + +class My3DObject(BaseDoc): + title: str + mesh: Mesh3D + pc: PointCloud3D + + +obj_file = 'https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj' + +doc = My3DObject( + title='My first 3D object!', + mesh=Mesh3D(url=obj_file), + pc=PointCloud3D(url=obj_file), +) + + +doc.mesh.tensors = doc.mesh.url.load() +doc.pc.tensors = doc.pc.url.load(samples=100) +``` \ No newline at end of file diff --git a/docs/data_types/audio/audio.md b/docs/data_types/audio/audio.md new file mode 100644 index 00000000000..cbe84c9067e --- /dev/null +++ b/docs/data_types/audio/audio.md @@ -0,0 +1,211 @@ +# ๐Ÿ”Š Audio + +DocArray supports many different modalities including `Audio`. +This section will show you how to load and handle audio data using DocArray. + +Moreover, you will learn about DocArray's audio-specific types, to represent your audio data ranging from [`AudioUrl`][docarray.typing.url.AudioUrl] to [`AudioBytes`][docarray.typing.bytes.AudioBytes] and [`AudioNdArray`][docarray.typing.tensor.audio.audio_ndarray.AudioNdArray]. + +!!! note + This requires a `pydub` dependency. You can install all necessary dependencies via: + ```cmd + pip install "docarray[audio]" + ``` + Additionally, you have to install `ffmpeg` (see more info [here](https://github.com/jiaaro/pydub#getting-ffmpeg-set-up)): + ```cmd + # on Mac with brew: + brew install ffmpeg + ``` + ```cmd + # on Linux with apt-get + apt-get install ffmpeg libavcodec-extra + ``` + + +## Load audio file + +First, let's define a class, which extends [`BaseDoc`][docarray.base_doc.doc.BaseDoc] and has an `url` attribute of type [`AudioUrl`][docarray.typing.url.AudioUrl], and an optional `tensor` attribute of type [`AudioTensor`](../../../../api_references/typing/tensor/audio). + +!!! tip + Check out our predefined [`AudioDoc`](#getting-started-predefined-audiodoc) to get started and play around with our audio features. 
+
+Next, you can instantiate an object of that class with a local or remote URL.
+
+```python
+from docarray import BaseDoc
+from docarray.typing import AudioUrl, AudioNdArray
+
+
+class MyAudio(BaseDoc):
+    url: AudioUrl
+    tensor: AudioNdArray = None
+    frame_rate: int = None
+
+
+doc = MyAudio(
+    url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.mp3?raw=true'
+)
+```
+
+Loading the content of the audio file is as easy as calling [`.load()`][docarray.typing.url.AudioUrl.load] on the [`AudioUrl`][docarray.typing.url.AudioUrl] instance.
+
+This will return a tuple of:
+
+- an [`AudioNdArray`][docarray.typing.tensor.audio.AudioNdArray] representing the audio file content
+- an integer representing the frame rate (number of signals for a certain period of time)
+
+```python
+doc.tensor, doc.frame_rate = doc.url.load()
+doc.summary()
+```
+<details>
+ Output + ``` { .text .no-copy } + ๐Ÿ“„ MyAudio : 2015696 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ url: AudioUrl โ”‚ https://github.com/docarray/docarray/blob/feat-rew โ”‚ + โ”‚ โ”‚ ... (length: 90) โ”‚ + โ”‚ tensor: AudioNdArray โ”‚ AudioNdArray of shape (30833,), dtype: float64 โ”‚ + โ”‚ frame_rate: int โ”‚ 44100 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
+ + +## AudioTensor + +DocArray offers several [`AudioTensor`s](../../../../api_references/typing/tensor/audio) to store your data to: + +- [`AudioNdArray`][docarray.typing.tensor.audio.audio_ndarray.AudioNdArray] +- [`AudioTorchTensor`][docarray.typing.AudioTorchTensor] +- [`AudioTensorFlowTensor`][docarray.typing.AudioTensorFlowTensor] + +If you specify the type of your tensor to one of the above, it will be cast to that automatically: + +```python hl_lines="7 8 15 16" +from docarray import BaseDoc +from docarray.typing import AudioTensorFlowTensor, AudioTorchTensor, AudioUrl + + +class MyAudio(BaseDoc): + url: AudioUrl + tf_tensor: AudioTensorFlowTensor = None + torch_tensor: AudioTorchTensor = None + + +doc = MyAudio( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.mp3?raw=true' +) + +doc.tf_tensor, _ = doc.url.load() +doc.torch_tensor, _ = doc.url.load() + +assert isinstance(doc.tf_tensor, AudioTensorFlowTensor) +assert isinstance(doc.torch_tensor, AudioTorchTensor) +``` + + +## AudioBytes + +Alternatively, you can load your [`AudioUrl`][docarray.typing.url.AudioUrl] instance to [`AudioBytes`][docarray.typing.bytes.AudioBytes], and your [`AudioBytes`][docarray.typing.bytes.AudioBytes] instance to an [`AudioTensor`](../../../../api_references/typing/tensor/audio) of your choice: + +```python hl_lines="15 16" + +from docarray import BaseDoc +from docarray.typing import AudioBytes, AudioTensor, AudioUrl + + +class MyAudio(BaseDoc): + url: AudioUrl = None + bytes_: AudioBytes = None + tensor: AudioTensor = None + + +doc = MyAudio( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.mp3?raw=true' +) + +doc.bytes_ = doc.url.load_bytes() # type(doc.bytes_) = AudioBytes +doc.tensor, _ = doc.bytes_.load() # type(doc.tensor) = AudioNdarray +``` + +Vice versa, you can also transform an [`AudioTensor`](../../../../api_references/typing/tensor/audio) to [`AudioBytes`][docarray.typing.bytes.AudioBytes]: + 
+```python +from docarray.typing import AudioBytes + + +bytes_from_tensor = doc.tensor.to_bytes() + +assert isinstance(bytes_from_tensor, AudioBytes) +``` + +## Save audio to file +You can save your [`AudioTensor`](../../../../api_references/typing/tensor/audio) to an audio file of any format as follows: +``` { .python } +tensor_reversed = doc.tensor[::-1] +tensor_reversed.save( + file_path='olleh.mp3', + format='mp3', +) +``` +## Play audio in a notebook + +You can play your audio sound in a notebook from its URL as well as its tensor, by calling `.display()` on either one. + +Play from `url`: +``` { .python } +doc.url.display() +``` + + + + + +
+ +Play from `tensor`: +``` { .python } +tensor_reversed.display() +``` + + + + +
+ + + + +## Getting started - Predefined `AudioDoc` + +To get started and play around with your audio data, DocArray provides a predefined [`AudioDoc`][docarray.documents.audio.AudioDoc], which includes all of the previously mentioned functionalities: + +``` { .python } +class AudioDoc(BaseDoc): + url: Optional[AudioUrl] + tensor: Optional[AudioTensor] + embedding: Optional[AnyEmbedding] + bytes_: Optional[AudioBytes] + frame_rate: Optional[int] +``` + +You can use this class directly or extend it to your preference: +```python +from docarray.documents import AudioDoc +from typing import Optional + + +# extend AudioDoc +class MyAudio(AudioDoc): + name: Optional[str] + + +audio = MyAudio( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.mp3?raw=true' +) +audio.name = 'My first audio doc!' +audio.tensor, audio.frame_rate = audio.url.load() +``` + diff --git a/docs/data_types/audio/hello.mp3 b/docs/data_types/audio/hello.mp3 new file mode 100644 index 00000000000..a576b89dee6 Binary files /dev/null and b/docs/data_types/audio/hello.mp3 differ diff --git a/docs/data_types/audio/olleh.mp3 b/docs/data_types/audio/olleh.mp3 new file mode 100644 index 00000000000..04528a3607c Binary files /dev/null and b/docs/data_types/audio/olleh.mp3 differ diff --git a/docs/data_types/image/display_notebook.jpg b/docs/data_types/image/display_notebook.jpg new file mode 100644 index 00000000000..edb8fa5a10b Binary files /dev/null and b/docs/data_types/image/display_notebook.jpg differ diff --git a/docs/data_types/image/image.md b/docs/data_types/image/image.md new file mode 100644 index 00000000000..1efc7b7be56 --- /dev/null +++ b/docs/data_types/image/image.md @@ -0,0 +1,204 @@ +# ๐Ÿ–ผ๏ธ Image + +DocArray supports many different modalities including the widely used `Image` modality. +This section will show you how to load and handle image data using DocArray. 
+
+Moreover, we will introduce DocArray's image-specific types, to represent your image data ranging from [`ImageUrl`][docarray.typing.url.ImageUrl] to [`ImageBytes`][docarray.typing.bytes.ImageBytes] and [`ImageNdArray`][docarray.typing.tensor.image.image_ndarray.ImageNdArray].
+
+!!! note
+    This requires the `Pillow` dependency. You can install all necessary dependencies via:
+    ```cmd
+    pip install "docarray[image]"
+    ```
+
+## Load image
+
+!!! tip
+    Check out our predefined [`ImageDoc`](#getting-started-predefined-imagedoc) to get started and play around with our image features.
+
+First, let's define our class `MyImage`, which extends [`BaseDoc`][docarray.base_doc.doc.BaseDoc] and has an `url` attribute of type [`ImageUrl`][docarray.typing.url.ImageUrl], as well as an optional `tensor` attribute of type [`ImageTensor`](../../../../api_references/typing/tensor/image).
+
+Next, let's instantiate a `MyImage` object with a local or remote URL.
+
+```python
+from docarray.typing import ImageTensor, ImageUrl
+from docarray import BaseDoc
+
+
+class MyImage(BaseDoc):
+    url: ImageUrl
+    tensor: ImageTensor = None
+
+
+img = MyImage(
+    url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true'
+)
+```
+
+To load the image data, you can call [`.load()`][docarray.typing.url.ImageUrl.load] on the `url` attribute. By default, [`ImageUrl.load()`][docarray.typing.url.ImageUrl.load] returns an [`ImageNdArray`][docarray.typing.tensor.image.image_ndarray.ImageNdArray] object.
+ +```python +from docarray.typing import ImageNdArray + +img.tensor = img.url.load() + +assert isinstance(img.tensor, ImageNdArray) +``` + +## ImageTensor + +DocArray offers several [`ImageTensor`s](../../../../api_references/typing/tensor/image) to store your data to: + +- [`ImageNdArray`][docarray.typing.ImageNdArray] +- [`ImageTorchTensor`][docarray.typing.ImageTorchTensor] +- [`ImageTensorFlowTensor`][docarray.typing.ImageTensorFlowTensor] + +If you specify the type of your tensor to one of the above, it will be cast to that automatically: + +```python hl_lines="7 8 12 13" +from docarray.typing import ImageTensorFlowTensor, ImageTorchTensor, ImageUrl +from docarray import BaseDoc + + +class MyImage(BaseDoc): + url: ImageUrl = None + tf_tensor: ImageTensorFlowTensor = None + torch_tensor: ImageTorchTensor = None + + +img = MyImage(url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true') +img.tf_tensor = img.url.load() +img.torch_tensor = img.url.load() + +assert isinstance(img.tf_tensor, ImageTensorFlowTensor) +assert isinstance(img.torch_tensor, ImageTorchTensor) +``` + +You can also load the URL content as a [`PIL.Image.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image) instance using [`ImageUrl.load_pil()`][docarray.typing.url.ImageUrl.load_pil]: + +```python +from PIL.Image import Image as PILImage + +img = MyImage( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true' +) +pil_img = img.url.load_pil() + +assert isinstance(pil_img, PILImage) +``` + +## Parameterized ImageTensor + +Like all of our tensors, the [`ImageTensor`s](../../../../api_references/typing/tensor/image) can be used in a parametrized way, specifying the shape of the images. +Let's say, for instance, all your images are of size `(200, 300, 3)`. 
+ +```python +import numpy as np +from docarray import BaseDoc +from docarray.typing import ImageNdArray + + +class MyImage(BaseDoc): + tensor: ImageNdArray[200, 300, 3] + + +img = MyImage(tensor=np.ones(shape=(200, 300, 3))) + +# this would fail: +# img = MyImage(tensor=np.ones(shape=(224, 224, 3))) +``` + +If you have RGB images of different shapes, you could specify only the dimension as well as the number of channels: + +```python +import numpy as np +from docarray import BaseDoc +from docarray.typing import ImageNdArray + + +class MyFlexibleImage(BaseDoc): + tensor: ImageNdArray['h', 'w', 3] + + +img_1 = MyFlexibleImage(tensor=np.zeros(shape=(200, 300, 3))) +img_2 = MyFlexibleImage(tensor=np.ones(shape=(224, 224, 3))) +``` + + + +## ImageBytes + +Alternatively, you can load your [`ImageUrl`][docarray.typing.url.ImageUrl] instance to [`ImageBytes`][docarray.typing.bytes.ImageBytes], and your [`ImageBytes`][docarray.typing.bytes.ImageBytes] instance to an [`ImageTensor`](../../../../api_references/typing/tensor/image) of your choice. 
+ +```python hl_lines="13 14" +from docarray.typing import ImageBytes, ImageTensor, ImageUrl +from docarray import BaseDoc + + +class MyImage(BaseDoc): + url: ImageUrl = None + bytes_: ImageBytes = None + tensor: ImageTensor = None + + +img = MyImage(url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/image-data/apple.png?raw=true') + +img.bytes_ = img.url.load_bytes() # type(img.bytes_) = ImageBytes +img.tensor = img.bytes_.load() # type(img.tensor) = ImageNdarray +``` + +Vice versa, you can also transform an [`ImageTensor`](../../../../api_references/typing/tensor/image) to [`ImageBytes`][docarray.typing.bytes.ImageBytes]: + +```python +from docarray.typing import ImageBytes + +bytes_from_tensor = img.tensor.to_bytes() + +assert isinstance(bytes_from_tensor, ImageBytes) +``` + +## Display image in a notebook + +You can display your image in a notebook from both an [`ImageUrl`][docarray.typing.url.ImageUrl] instance as well as an +[`ImageTensor`](../../../../api_references/typing/tensor/image) instance. + + +
+ ![](display_notebook.jpg){ width="900" } +
+ + +## Getting started - Predefined `ImageDoc` + +To get started and play around with the image-modality, DocArray provides a predefined [`ImageDoc`][docarray.documents.image.ImageDoc], which includes all of the previously mentioned functionalities: + +``` { .python } +class ImageDoc(BaseDoc): + url: Optional[ImageUrl] + tensor: Optional[ImageTensor] + embedding: Optional[AnyEmbedding] + bytes_: Optional[ImageBytes] +``` + +You can use this class directly or extend it to your preference: +``` { .python } +from docarray.documents import ImageDoc +from docarray.typing import AnyEmbedding +from typing import Optional + + +# extending ImageDoc +class MyImage(ImageDoc): + image_title: str + second_embedding: Optional[AnyEmbedding] + + +image = MyImage( + image_title='My first image', + url='http://www.jina.ai/image.jpg', +) +image.tensor = image.url.load() +model = SomeEmbeddingModel() +image.embedding = model(image.tensor) +image.second_embedding = model(image.tensor) +``` diff --git a/docs/data_types/multimodal/multimodal.md b/docs/data_types/multimodal/multimodal.md new file mode 100644 index 00000000000..35f7ff2dba4 --- /dev/null +++ b/docs/data_types/multimodal/multimodal.md @@ -0,0 +1,182 @@ +# ๐Ÿ—ƒ Multimodal + +In this section, we will walk through how to use DocArray to process multiple data modalities in tandem. + +!!! tip "See also" + In this section, we will work with image and text data. If you are not yet familiar with how to process these + modalities individually, you may want to check out the respective examples first: [`Image`](../image/image.md) + and [`Text`](../text/text.md) + +## Model your data + +If you work with multiple modalities at the same time, most likely they stand in some relation with each other. +DocArray allows you to model your data and these relationships. + +### Define a schema + +Let's suppose you want to model a page of a newspaper that contains a main text, an image URL, a corresponding tensor +as well as a description. 
You can model this example in the following way: + +```python +from docarray import BaseDoc +from docarray.typing import ImageTorchTensor, ImageUrl + + +class Page(BaseDoc): + main_text: str + img_url: ImageUrl = None + img_description: str = None + img_tensor: ImageTorchTensor = None +``` + +### Instantiate an object + +After extending [`BaseDoc`][docarray.BaseDoc] and defining your schema, you can instantiate an object with your actual +data. + +```python +page = Page( + main_text='Hello world', + img_url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/docs/assets/favicon.png?raw=true', + img_description='This is the image of an apple', +) +page.img_tensor = page.img_url.load() + +page.summary() +``` +
+ Output + ``` { .text .no-copy } + ๐Ÿ“„ Page : 8f39674 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ main_text: str โ”‚ Hello world โ”‚ + โ”‚ img_url: ImageUrl โ”‚ https://github.com/docarray/docarray/blob/feโ€ฆ โ”‚ + โ”‚ โ”‚ ... (length: 90) โ”‚ + โ”‚ img_description: str โ”‚ This is DocArray โ”‚ + โ”‚ img_tensor: ImageTorchTensor โ”‚ ImageTorchTensor of shape (320, 320, 3), โ”‚ + โ”‚ โ”‚ dtype: torch.uint8 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
+ +### Access data + +After instantiation, each modality can be accessed directly from the `Page` object: + +```python +print(page.main_text) +print(page.img_url) +print(page.img_description) +print(page.img_tensor) +``` +
+ Output + ``` { .text .no-copy } + Hello world + https://github.com/docarray/docarray/blob/feat-rewrite-v2/docs/assets/favicon.png?raw=true + This is DocArray + ImageTorchTensor([[[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + ..., + [0, 0, 0]]]) + ``` +
+ +### Nested data + +If the data you want to model requires a more complex structure, nesting your attributes may be a good solution. + +For this example, let's try to define a schema to represent a newspaper. The newspaper should consist of a cover page, +any number of following pages, and some metadata. Further, each page contains a main text and can contain an image +and an image description. + +To implement this you can simply add a `Newspaper` class to our previous implementation. The newspaper has a required +`cover_page` attribute of type `Page` as well as a `pages` attribute, which is a `DocList` of `Page`s. + +```python +from docarray import BaseDoc, DocList +from docarray.typing import ImageTorchTensor, ImageUrl + + +class Page(BaseDoc): + main_text: str + img_url: ImageUrl = None + img_description: str = None + img_tensor: ImageTorchTensor = None + + +class Newspaper(BaseDoc): + cover: Page + pages: DocList[Page] = None + metadata: dict = None +``` + +You can instantiate this more complex `Newspaper` object in the same way as before: + +```python +cover_page = Page( + main_text='DocArray Daily', + img_url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/docs/assets/favicon.png', +) + +pages = DocList[Page]( + [ + Page( + main_text='Hello world', + img_url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/docs/assets/favicon.png', + img_description='This is the image of an apple', + ), + Page(main_text='Second page'), + Page(main_text='Third page'), + ] +) + +docarray_daily = Newspaper( + cover=cover_page, + pages=pages, + metadata={'author': 'DocArray and friends', 'issue': '0.30.0'}, +) + +docarray_daily.summary() +``` +
+ Output + ``` { .text .no-copy } + ๐Ÿ“„ Newspaper : 63189f7 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ metadata: dict โ”‚ {'author': 'DocArray and friends', 'issue': '0.0.3 ... } โ”‚ + โ”‚ โ”‚ (length: 2) โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ”œโ”€โ”€ ๐Ÿ”ถ cover: Page + โ”‚ โ””โ”€โ”€ ๐Ÿ“„ Page : ca164e3 ... + โ”‚ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ Attribute โ”‚ Value โ”‚ + โ”‚ โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ โ”‚ main_text: str โ”‚ DocArray Daily โ”‚ + โ”‚ โ”‚ img_url: ImageUrl โ”‚ https://github.com/docarray/docarray/blob/feat-โ€ฆ โ”‚ + โ”‚ โ”‚ โ”‚ ... (length: 81) โ”‚ + โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ””โ”€โ”€ ๐Ÿ’  pages: DocList[Page] + โ”œโ”€โ”€ ๐Ÿ“„ Page : 64ed19c ... 
+ โ”‚ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ Attribute โ”‚ Value โ”‚ + โ”‚ โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ โ”‚ main_text: str โ”‚ Hello world โ”‚ + โ”‚ โ”‚ img_url: ImageUrl โ”‚ https://github.com/docarray/docarray/blob/feโ€ฆ โ”‚ + โ”‚ โ”‚ โ”‚ ... (length: 81) โ”‚ + โ”‚ โ”‚ img_description: str โ”‚ DocArray logoooo โ”‚ + โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ”œโ”€โ”€ ๐Ÿ“„ Page : 4bd7e45 ... + โ”‚ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ Attribute โ”‚ Value โ”‚ + โ”‚ โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ โ”‚ main_text: str โ”‚ Second page โ”‚ + โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ””โ”€โ”€ ... 1 more Page documents + ``` +
\ No newline at end of file diff --git a/docs/data_types/table/table.md b/docs/data_types/table/table.md new file mode 100644 index 00000000000..202f84208ae --- /dev/null +++ b/docs/data_types/table/table.md @@ -0,0 +1,223 @@ +# ๐Ÿ“Š Table + +DocArray supports many different modalities including tabular data. +This section will show you how to load and handle tabular data using DocArray. + +## Load CSV table + +A common way to store tabular data is via `CSV` (comma-separated values) files. +You can easily load such data from a given `CSV` file into a [`DocList`][docarray.DocList]. +Let's take a look at the following example file, which includes data about books and their authors and publishing year. + +```text +title,author,year +Harry Potter and the Philosopher's Stone,J. K. Rowling,1997 +Klara and the sun,Kazuo Ishiguro,2020 +A little life,Hanya Yanagihara,2015 +``` + +First, you have to define the Document schema describing the data. +```python +from docarray import BaseDoc + + +class Book(BaseDoc): + title: str + author: str + year: int +``` +Next, you can load the content of the CSV file to a [`DocList`][docarray.DocList] instance of `Book`s via [`.from_csv()`][docarray.array.doc_list.io.IOMixinArray.from_csv]. +```python +from docarray import DocList + + +docs = DocList[Book].from_csv( + file_path='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/books.csv?raw=true' +) +docs.summary() +``` +
+ Output + ``` { .text .no-copy } + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€ DocList Summary โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ + โ”‚ Type DocList[Book] โ”‚ + โ”‚ Length 3 โ”‚ + โ”‚ โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€โ”€ Document Schema โ”€โ”€โ•ฎ + โ”‚ โ”‚ + โ”‚ Book โ”‚ + โ”‚ โ”œโ”€โ”€ title: str โ”‚ + โ”‚ โ”œโ”€โ”€ author: str โ”‚ + โ”‚ โ””โ”€โ”€ year: int โ”‚ + โ”‚ โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
+ +The resulting [`DocList`][docarray.DocList] object contains three `Book`s since each row of the CSV file corresponds to one book and is assigned to one `Book` instance. + + +## Save to CSV file + +Vice versa, you can also store your [`DocList`][docarray.DocList] data in a `.csv` file using [`.to_csv()`][docarray.array.doc_list.io.IOMixinArray.to_csv]. +``` { .python } +docs.to_csv(file_path='/path/to/my_file.csv') +``` + +Tabular data is often not the best choice to represent nested Documents. Hence, nested Documents will be stored flattened and can be accessed by their `'__'`-separated access paths. + +Let's take a look at an example. We now want to store not only the book data but moreover book review data. To do so, we define a `BookReview` class that has a nested `book` attribute as well as the non-nested attributes `n_ratings` and `stars`. + +```python +class BookReview(BaseDoc): + book: Book + n_ratings: int + stars: float + + +review_docs = DocList[BookReview]( + [BookReview(book=book, n_ratings=12345, stars=5) for book in docs] +) +review_docs.summary() +``` +
+ Output + ``` { .text .no-copy} + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ DocList Summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ + โ”‚ Type DocList[BookReview] โ”‚ + โ”‚ Length 3 โ”‚ + โ”‚ โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ•ญโ”€โ”€โ”€โ”€ Document Schema โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ + โ”‚ BookReview โ”‚ + โ”‚ โ”œโ”€โ”€ book: Book โ”‚ + โ”‚ โ”‚ โ”œโ”€โ”€ title: str โ”‚ + โ”‚ โ”‚ โ”œโ”€โ”€ author: str โ”‚ + โ”‚ โ”‚ โ””โ”€โ”€ year: int โ”‚ + โ”‚ โ”œโ”€โ”€ n_ratings: int โ”‚ + โ”‚ โ””โ”€โ”€ stars: float โ”‚ + โ”‚ โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
+
+As expected, all nested attributes will be stored by their access paths.
+``` { .python }
+review_docs.to_csv(file_path='/path/to/nested_documents.csv')
+```
+``` { .text .no-copy hl_lines="1" }
+id,book__id,book__title,book__author,book__year,n_ratings,stars
+d6363aa3b78b4f4244fb976570a84ff7,8cd85fea52b3a3bc582cf56c9d612cbb,Harry Potter and the Philosopher's Stone,J. K. Rowling,1997,12345,5.0
+5b53fff67e6b6cede5870f2ee09edb05,87b369b93593967226c525cf226e3325,Klara and the sun,Kazuo Ishiguro,2020,12345,5.0
+addca0475756fc12cdec8faf8fb10d71,03194cec1b75927c2259b3c0fff1ab6f,A little life,Hanya Yanagihara,2015,12345,5.0
+
+```
+
+## Handle TSV tables
+
+Not only can you load and save comma-separated values (`CSV`) data, but also tab-separated values (`TSV`),
+by adjusting the `dialect` parameter in [`.from_csv()`][docarray.array.doc_list.io.IOMixinArray.from_csv]
+and [`.to_csv()`][docarray.array.doc_list.io.IOMixinArray.to_csv].
+
+The dialect defaults to `'excel'`, which refers to comma-separated values. For tab-separated values, you can use
+`'excel-tab'`.
+
+Let's take a look at what this would look like with a tab-separated file:
+
+```text
+title	author	year
+Title1	author1	2020
+Title2	author2	1234
+```
+
+```python
+docs = DocList[Book].from_csv(
+    file_path='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/books.tsv?raw=true',
+    dialect='excel-tab',
+)
+for doc in docs:
+    doc.summary()
+```
+<details>
+ Output + ```text + ๐Ÿ“„ Book : c1ac9d4 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ title: str โ”‚ Title1 โ”‚ + โ”‚ author: str โ”‚ author1 โ”‚ + โ”‚ year: int โ”‚ 2020 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ๐Ÿ“„ Book : c1ac9d4 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ title: str โ”‚ Title1 โ”‚ + โ”‚ author: str โ”‚ author1 โ”‚ + โ”‚ year: int โ”‚ 2020 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
+ +Great! All the data is correctly read and stored in `Book` instances. +## Other separators + +If your values are separated by yet another separator, you can create your own `csv.Dialect` class. +To do so you can create a class, that inherits from `csv.Dialect`. +Within this class, you can define your dialect's behavior by setting the provided [formatting parameters](https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters). + +For instance, let's assume you have a semicolon-separated table: + +```text +first_name;last_name;year +Jane;Austin;2020 +John;Doe;1234 +``` + +Now, let's define our `SemicolonSeparator` class. Next to the `delimiter` parameter, we have to set some more formatting parameters such as `doublequote` and `lineterminator`. +```python +import csv + + +class SemicolonSeparator(csv.Dialect): + delimiter = ';' + doublequote = True + lineterminator = '\r\n' + quotechar = '"' + quoting = csv.QUOTE_MINIMAL +``` +Finally, you can load your data by setting the `dialect` parameter in [`.from_csv()`][docarray.array.doc_list.io.IOMixinArray.from_csv] to an instance of your `SemicolonSeparator`. +```python +docs = DocList[Book].from_csv( + file_path='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/books_semicolon_sep.csv?raw=true', + dialect=SemicolonSeparator(), +) +for doc in docs: + doc.summary() +``` +
+ Output + ```text + ๐Ÿ“„ Book : 321e9fd ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ title: str โ”‚ Title1 โ”‚ + โ”‚ author: str โ”‚ author1 โ”‚ + โ”‚ year: int โ”‚ 2020 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ๐Ÿ“„ Book : 16d2097 ... + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ Attribute โ”‚ Value โ”‚ + โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ title: str โ”‚ Title2 โ”‚ + โ”‚ author: str โ”‚ author2 โ”‚ + โ”‚ year: int โ”‚ 1234 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + ``` +
diff --git a/docs/data_types/text/text.md b/docs/data_types/text/text.md new file mode 100644 index 00000000000..7af4cbe7ab7 --- /dev/null +++ b/docs/data_types/text/text.md @@ -0,0 +1,108 @@ + +# ๐Ÿ”ค Text + +DocArray supports many different modalities including `Text`. +This section will show you how to load and handle text data using DocArray. + +!!! tip + Check out our predefined [`TextDoc`](#getting-started-predefined-textdoc) to get started and play around with our text features. + +You can store text in DocArray like this: + +```python +from docarray import BaseDoc + + +class MyText(BaseDoc): + text: str = None + + +doc = MyText(text='Hello world!') +``` + +The text can include any type of character, including emojis: + +```python +doc.text = '๐Ÿ‘‹ เคจเคฎเคธเฅเคคเฅ‡ เคฆเฅเคจเคฟเคฏเคพ! ไฝ ๅฅฝไธ–็•Œ๏ผใ“ใ‚“ใซใกใฏไธ–็•Œ๏ผ ะŸั€ะธะฒะตั‚ ะผะธั€!' +``` + +## Load text file + +If your text data is too long to be written inline or if it is stored in a file, you can also define the URL as a [`TextUrl`][docarray.typing.url.text_url.TextUrl] first and then load the text data. + +Let's first define a schema: + +```python +from docarray import BaseDoc +from docarray.typing import TextUrl + + +class MyText(BaseDoc): + text: str = None + url: TextUrl = None +``` +Next, you can instantiate a `MyText` object with a `url` attribute and load its content to the `text` field. +```python +doc = MyText( + url='https://www.w3.org/History/19921103-hypertext/hypertext/README.html', +) +doc.text = doc.url.load() + +assert doc.text.startswith('Read Me') +``` + +## Segment long texts + +Often times when you index or search text data, you donโ€™t want to consider thousands of words as one huge string. +Instead, some finer granularity would be nice. You can do this by leveraging nested fields. For example, letโ€™s split some page content into its sentences by `'.'`. 
+ +```python +from docarray import BaseDoc, DocList + + +class Sentence(BaseDoc): + text: str + + +class Page(BaseDoc): + content: DocList[Sentence] + + +long_text = 'First sentence. Second sentence. And many many more sentences.' +page = Page(content=[Sentence(text=t) for t in long_text.split('.')]) + +page.summary() +``` +
+ Output + ``` { .text .no-copy } + ๐Ÿ“„ Page : 13d909a ... + โ””โ”€โ”€ ๐Ÿ’  content: DocList[Sentence] + โ”œโ”€โ”€ ๐Ÿ“„ Sentence : 6725382 ... + โ”‚ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ Attribute โ”‚ Value โ”‚ + โ”‚ โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ โ”‚ text: str โ”‚ First sentence โ”‚ + โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ”œโ”€โ”€ ๐Ÿ“„ Sentence : 17a934c ... + โ”‚ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ โ”‚ Attribute โ”‚ Value โ”‚ + โ”‚ โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค + โ”‚ โ”‚ text: str โ”‚ Second sentence โ”‚ + โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + โ””โ”€โ”€ ... 2 more Sentence documents + ``` +
+ +## Getting started - Predefined `TextDoc` + +To get started and play around with your text data, DocArray provides a predefined [`TextDoc`][docarray.documents.text.TextDoc], which includes all of the previously mentioned functionalities: + +``` { .python } +class TextDoc(BaseDoc): + text: Optional[str] + url: Optional[TextUrl] + embedding: Optional[AnyEmbedding] + bytes_: Optional[bytes] +``` + diff --git a/docs/data_types/video/key_frames.png b/docs/data_types/video/key_frames.png new file mode 100644 index 00000000000..5c039e11c20 Binary files /dev/null and b/docs/data_types/video/key_frames.png differ diff --git a/docs/data_types/video/mov_bbb.mp4 b/docs/data_types/video/mov_bbb.mp4 new file mode 100644 index 00000000000..530676502bd Binary files /dev/null and b/docs/data_types/video/mov_bbb.mp4 differ diff --git a/docs/data_types/video/mov_bbb_framerate_60.mp4 b/docs/data_types/video/mov_bbb_framerate_60.mp4 new file mode 100644 index 00000000000..46345c4f24f Binary files /dev/null and b/docs/data_types/video/mov_bbb_framerate_60.mp4 differ diff --git a/docs/data_types/video/video.md b/docs/data_types/video/video.md new file mode 100644 index 00000000000..f09dce2e978 --- /dev/null +++ b/docs/data_types/video/video.md @@ -0,0 +1,223 @@ +# ๐ŸŽฅ Video + +DocArray supports many modalities including `Video`. +This section will show you how to load and handle video data using DocArray. + +Moreover, you will learn about DocArray's video-specific types, to represent your video data ranging from [`VideoUrl`][docarray.typing.url.VideoUrl] to [`VideoBytes`][docarray.typing.bytes.VideoBytes] and [`VideoNdArray`][docarray.typing.tensor.video.video_ndarray.VideoNdArray]. + +!!! note + This requires an `av` dependency. You can install all necessary dependencies via: + ```cmd + pip install "docarray[video]" + ``` + +## Load video data + +In DocArray video data is represented by a video tensor, an audio tensor, and the key frame indices. 
+
+![type:video](mov_bbb.mp4){: style='width: 600px; height: 330px'}
+
+!!! tip
+    Check out our predefined [`VideoDoc`](#getting-started-predefined-videodoc) to get started and play around with our video features.
+
+First, let's define a `MyVideo` class with all of those attributes and instantiate an object with a local or remote URL:
+
+```python
+from docarray import BaseDoc
+from docarray.typing import AudioNdArray, NdArray, VideoNdArray, VideoUrl
+
+
+class MyVideo(BaseDoc):
+    url: VideoUrl
+    video: VideoNdArray = None
+    audio: AudioNdArray = None
+    key_frame_indices: NdArray = None
+
+
+doc = MyVideo(
+    url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true'
+)
+```
+
+Now you can load the video file content by simply calling [`.load()`][docarray.typing.url.video_url.VideoUrl.load] on your [`VideoUrl`][docarray.typing.url.VideoUrl] instance.
+This will return a [NamedTuple](https://docs.python.org/3/library/typing.html#typing.NamedTuple) of a **video tensor**, an **audio tensor**, and the **key frame indices**:
+
+- The video tensor is a 4-dim array of shape `(n_frames, height, width, channels)`.
The first dimension represents the frame id. +The last three dimensions represent the image data of the corresponding frame. + +- If the video contains audio, it will be stored as an AudioNdArray. + +- Additionally, the key frame indices will be stored. A key frame is defined as the starting point of any smooth transition. + + +```python +doc.video, doc.audio, doc.key_frame_indices = doc.url.load() + +assert isinstance(doc.video, VideoNdArray) +assert isinstance(doc.audio, AudioNdArray) +assert isinstance(doc.key_frame_indices, NdArray) + +print(doc.video.shape) +``` +``` { .text .no-copy } +(250, 176, 320, 3) +``` +For the given example you can infer from `doc.video`'s shape, that the video contains 250 frames of size 176x320 in RGB mode. +Based on the overall length of the video (10s), you can infer the framerate is approximately 250/10 = 25 frames per second (fps). + + +## VideoTensor + +DocArray offers several [`VideoTensor`s](../../../../api_references/typing/tensor/video) to store your data to: + +- [`VideoNdArray`][docarray.typing.tensor.video.video_ndarray.VideoNdArray] +- [`VideoTorchTensor`][docarray.typing.tensor.video.VideoTorchTensor] +- [`VideoTensorFlowTensor`][docarray.typing.tensor.video.VideoTensorFlowTensor] + +If you specify the type of your tensor to one of the above, it will be cast to that automatically: + +```python hl_lines="7 8 15 16" +from docarray import BaseDoc +from docarray.typing import VideoTensorFlowTensor, VideoTorchTensor, VideoUrl + + +class MyVideo(BaseDoc): + url: VideoUrl + tf_tensor: VideoTensorFlowTensor = None + torch_tensor: VideoTorchTensor = None + + +doc = MyVideo( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' +) + +doc.tf_tensor = doc.url.load().video +doc.torch_tensor = doc.url.load().video + +assert isinstance(doc.tf_tensor, VideoTensorFlowTensor) +assert isinstance(doc.torch_tensor, VideoTorchTensor) +``` + + + +## VideoBytes + +Alternatively, you can load your 
[`VideoUrl`][docarray.typing.url.VideoUrl] instance to [`VideoBytes`][docarray.typing.bytes.VideoBytes], and your [`VideoBytes`][docarray.typing.bytes.VideoBytes] instance to a [`VideoTensor`](../../../../api_references/typing/tensor/video) of your choice: + +```python hl_lines="15 16" +from docarray import BaseDoc +from docarray.typing import VideoTensor, VideoUrl, VideoBytes + + +class MyVideo(BaseDoc): + url: VideoUrl + bytes_: VideoBytes = None + video: VideoTensor = None + + +doc = MyVideo( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' +) + +doc.bytes_ = doc.url.load_bytes() +doc.video = doc.url.load().video +``` + +Vice versa, you can also transform a [`VideoTensor`](../../../../api_references/typing/tensor/video) to [`VideoBytes`][docarray.typing.bytes.VideoBytes]: + +```python +from docarray.typing import VideoBytes + +bytes_from_tensor = doc.video.to_bytes() + +assert isinstance(bytes_from_tensor, VideoBytes) +``` + + +## Key frame extraction + +A key frame is defined as the starting point of any smooth transition. +Given the key frame indices, you can access selected scenes. + +```python +indices = doc.key_frame_indices +first_scene = doc.video[indices[0] : indices[1]] + +assert (indices == [0, 95]).all() +assert first_scene.shape == (95, 176, 320, 3) +``` + +Or you can access the first frame of all new scenes and display them in a notebook: + +``` { .python } +from docarray.typing import ImageNdArray +from pydantic import parse_obj_as + + +key_frames = doc.video[doc.key_frame_indices] +for frame in key_frames: + img = parse_obj_as(ImageNdArray, frame) + img.display() +``` + +
+ ![](key_frames.png){ width="350" } +
+</details>
+
+
+## Save video to file
+
+You can save your video tensor to a file. In the example below you save the video with a framerate of 60 fps, which results in a 4-second video, instead of the original 10-second video with a frame rate of 25 fps.
+``` { .python }
+doc.video.save(
+    file_path="/path/my_video.mp4",
+    video_frame_rate=60,
+)
+```
+
+## Display video in a notebook
+
+You can play a video in a notebook from its URL as well as its tensor, by calling `.display()` on either one. For the latter, you can optionally give the corresponding [`AudioTensor`](../../../../api_references/typing/tensor/audio) as a parameter.
+
+``` { .python }
+doc_fast = MyVideo(url="/path/my_video.mp4")
+doc_fast.url.display()
+```
+![type:video](mov_bbb_framerate_60.mp4){: style='width: 600px; height: 330px'}
+
+
+
+## Getting started - Predefined `VideoDoc`
+
+To get started and play around with your video data, DocArray provides a predefined [`VideoDoc`][docarray.documents.video.VideoDoc], which includes all of the previously mentioned functionalities:
+
+``` { .python }
+class VideoDoc(BaseDoc):
+    url: Optional[VideoUrl]
+    audio: Optional[AudioDoc] = AudioDoc()
+    tensor: Optional[VideoTensor]
+    key_frame_indices: Optional[AnyTensor]
+    embedding: Optional[AnyEmbedding]
+    bytes_: Optional[bytes]
+```
+
+You can use this class directly or extend it to your preference:
+
+```python
+from typing import Optional
+
+from docarray.documents import VideoDoc
+
+
+# extend it
+class MyVideo(VideoDoc):
+    name: Optional[str]
+
+
+video = MyVideo(
+    url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true'
+)
+video.name = 'My first video doc!'
+video.tensor = video.url.load().video +``` \ No newline at end of file diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000000..d1e81086a86 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,3 @@ +:root > * { + --md-code-hl-color: #FFFFD0; +} diff --git a/mkdocs.yml b/mkdocs.yml index f4441995378..a55e6d77080 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,6 +12,7 @@ theme: features: # - navigation.sections - navigation.indexes + - content.code.copy palette: # Palette toggle for light mode - scheme: default @@ -59,10 +60,13 @@ extra: - icon: fontawesome/brands/twitter link: https://twitter.com/docarray +extra_css: + - stylesheets/extra.css plugins: - search - awesome-pages + - mkdocs-video - mkdocstrings: handlers: python: @@ -85,6 +89,15 @@ nav: - how_to/multimodal_training_and_serving.md - how_to/optimize_performance_with_id_generation.md - how_to/audio2text.md + + - Data Types: + - data_types/text/text.md + - data_types/image/image.md + - data_types/audio/audio.md + - data_types/video/video.md + - data_types/3d_mesh/3d_mesh.md + - data_types/table/table.md + - data_types/multimodal/multimodal.md - ... - Glossary: glossary.md - Contributing: CONTRIBUTING.md diff --git a/poetry.lock b/poetry.lock index 12dd4370927..771cafd3e80 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1539,7 +1539,7 @@ name = "lxml" version = "4.9.2" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
category = "main" -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ {file = "lxml-4.9.2-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:76cf573e5a365e790396a5cc2b909812633409306c6531a6877c59061e42c4f2"}, @@ -1957,6 +1957,22 @@ files = [ {file = "mkdocs_material_extensions-1.1.1.tar.gz", hash = "sha256:9c003da71e2cc2493d910237448c672e00cefc800d3d6ae93d2fc69979e3bd93"}, ] +[[package]] +name = "mkdocs-video" +version = "1.5.0" +description = "" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mkdocs-video-1.5.0.tar.gz", hash = "sha256:0defc018f4b7927f8afffc4d8e039c84dfba636dffc5e25e2bfa8d6350bc8eca"}, + {file = "mkdocs_video-1.5.0-py3-none-any.whl", hash = "sha256:b35613d4dacbac2dfa94d8c2600383cda14ad99a1fa1542b5fc4e9c6d19e9fe1"}, +] + +[package.dependencies] +lxml = ">=4.7.0" +mkdocs = ">=1.1.0,<2" + [[package]] name = "mkdocstrings" version = "0.20.0" @@ -4605,4 +4621,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "dd56d7cfa5b6758063baba58a5259f06535e0f425366360d042836aa479eab15" +content-hash = "61780ee493f649cc3cc164f8a3585083d69aed63831fad3c3cdcf91609804221" diff --git a/pyproject.toml b/pyproject.toml index 6982b351b47..49a5cd704fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,6 +67,7 @@ mkdocstrings = {extras = ["python"], version = ">=0.20.0"} mkdocs-material= ">=9.1.2" mkdocs-awesome-pages-plugin = ">=2.8.0" mktestdocs= ">=0.2.0" +mkdocs-video = ">=1.5.0" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 6ca32d7700f..085022b5a00 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -44,7 +44,12 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): @pytest.mark.parametrize( - 'fpath', 
pathlib.Path('docs/user_guide').glob('**/*.md'), ids=str + 'fpath', + [ + *list(pathlib.Path('docs/user_guide').glob('**/*.md')), + *list(pathlib.Path('docs/data_types').glob('**/*.md')), + ], + ids=str, ) def test_files_good(fpath): check_md_file(fpath=fpath, memory=True) diff --git a/tests/toydata/books.csv b/tests/toydata/books.csv index 7467bd4586e..4e9cab40504 100644 --- a/tests/toydata/books.csv +++ b/tests/toydata/books.csv @@ -1,4 +1,4 @@ -title,author,year -Harry Potter and the Philosopher's Stone,J. K. Rowling,1997 -Klara and the sun,Kazuo Ishiguro,2020 -A little life,Hanya Yanagihara,2015 \ No newline at end of file +title,author,year +Harry Potter and the Philosopher's Stone,J. K. Rowling,1997 +Klara and the sun,Kazuo Ishiguro,2020 +A little life,Hanya Yanagihara,2015