diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index da718519682..3d966d34904 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -121,7 +121,7 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this [`DocList`][docarray.typing.DocList] using the passed values + """Set all Documents in this [`DocList`][docarray.array.doc_list.doc_list.DocList] using the passed values :param field: name of the fields to extract :values: the values to set at the DocList level @@ -140,7 +140,7 @@ def to_protobuf(self) -> 'DocListProto': ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a [`DocList`][docarray.typing.DocList] into a NodeProto protobuf message. + """Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto protobuf message. This function should be called when a DocList is nested into another Document that need to be converted into a protobuf @@ -157,13 +157,11 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of [`DocList`s][docarray.typing.DocList], the list will be flattened + results in a nested list or list of [`DocList`s][docarray.array.doc_list.doc_list.DocList], the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and `"__"`-separated. It describes the path from the first level to an arbitrary one, e.g. `'content__image__url'`. - :param access_path: a string that represents the access path (`"__"`-separated). - :return: list of the accessed objects, flattened if nested. 
```python from docarray import BaseDoc, DocList, Text @@ -210,7 +208,8 @@ class Book(BaseDoc): chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings ``` - If your [`DocList`][docarray.typing.DocList] is in doc_vec mode, and you want to access a field of + + If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: ```python @@ -232,6 +231,9 @@ class Image(BaseDoc): access_path='tensor' ) # tensor of shape (2, 3, 224, 224) ``` + + :param access_path: a string that represents the access path ("__"-separated). + :return: list of the accessed objects, flattened if nested. """ ... @@ -263,7 +265,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]: def summary(self): """ - Print a summary of this [`DocList`][docarray.typing.DocList] object and a summary of the schema of its + Print a summary of this [`DocList`][docarray.array.doc_list.doc_list.DocList] object and a summary of the schema of its Document type. """ DocArraySummary(self).summary() @@ -275,13 +277,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields [`DocList`][docarray.typing.DocList] of size `batch_size`. + Creates a `Generator` that yields [`DocList`][docarray.array.doc_list.doc_list.DocList] of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. 
- :yield: a Generator of [`DocList`][docarray.typing.DocList], each in the length of `batch_size` + :yield: a Generator of [`DocList`][docarray.array.doc_list.doc_list.DocList], each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index d01d7a31e0d..8eb1a822d59 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -96,6 +96,7 @@ class Image(BaseDoc): # You can also set fields, with `docs.tensor = np.random.random([10, 100])`: + import numpy as np docs.tensor = np.random.random([10, 100]) diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index e0814e89fa8..9f153e2f1bd 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -141,7 +141,7 @@ def from_bytes( :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compression algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized `DocList` """ @@ -247,7 +247,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between : `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param file_ctx: File or filename or serialized bytes where the data is stored. 
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store @@ -277,7 +277,7 @@ def from_base64( :param data: Base64 string to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized `DocList` """ @@ -297,7 +297,7 @@ def to_base64( """Serialize itself into base64 encoded string. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store """ @@ -566,7 +566,7 @@ def _load_binary_all( ): """Read a `DocList` object from a binary file :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: a `DocList` """ @@ -646,7 +646,7 @@ def _load_binary_stream( """Yield `Document` objects from a binary file :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: a generator of `Document` objects """ @@ -702,13 +702,7 @@ def load_binary( ) -> Union[T, Generator['T_doc', None, None]]: """Load doc_list elements from a compressed binary file. - :param file: File or filename or serialized bytes where the data is stored. - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a `DocList` object !!! note If `file` is `str` it can specify `protocol` and `compress` as file extensions. @@ -716,6 +710,15 @@ def load_binary( string interpolation of the respective `protocol` and `compress` methods. For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf` and `compress=lz4`. + + :param file: File or filename or serialized bytes where the data is stored. + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :param streaming: if `True` returns a generator over `Document` objects. + + :return: a `DocList` object + """ load_protocol: Optional[str] = protocol load_compress: Optional[str] = compress @@ -765,7 +768,7 @@ def save_binary( :param file: File or filename to which the data is saved. :param protocol: protocol to use. 
It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` !!! note diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index a5c42a82ee4..0ed39bd0d49 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -1,5 +1,15 @@ import os -from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Optional, + Type, + TypeVar, + Union, + no_type_check, +) import orjson from pydantic import BaseModel, Field @@ -12,11 +22,16 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: + from pydantic import Protocol + from pydantic.types import StrBytes + from pydantic.typing import AbstractSetIntStr, MappingIntStrAny + from docarray.array.doc_vec.column_storage import ColumnStorageView _console: Console = Console() T = TypeVar('T', bound='BaseDoc') +T_update = TypeVar('T_update', bound='UpdateMixin') class BaseDoc(BaseModel, IOMixin, UpdateMixin, BaseNode): @@ -141,3 +156,67 @@ def _docarray_to_json_compatible(self) -> Dict: :return: A dictionary of the BaseDoc object """ return self.dict() + + ######################################################################################################################################################## + ### this section is just for documentation purposes will be removed later once https://github.com/mkdocstrings/griffe/issues/138 is fixed ############## + ######################################################################################################################################################## + + def json( + self, + *, + include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None, + exclude: Optional[Union['AbstractSetIntStr', 
'MappingIntStrAny']] = None, + by_alias: bool = False, + skip_defaults: Optional[bool] = None, + exclude_unset: bool = False, + exclude_defaults: bool = False, + exclude_none: bool = False, + encoder: Optional[Callable[[Any], Any]] = None, + models_as_dict: bool = True, + **dumps_kwargs: Any, + ) -> str: + """ + Generate a JSON representation of the model, `include` and `exclude` arguments as per `dict()`. + + `encoder` is an optional function to supply as `default` to json.dumps(), other arguments as per `json.dumps()`. + """ + return super().json( + include=include, + exclude=exclude, + by_alias=by_alias, + skip_defaults=skip_defaults, + exclude_unset=exclude_unset, + exclude_defaults=exclude_defaults, + exclude_none=exclude_none, + encoder=encoder, + models_as_dict=models_as_dict, + **dumps_kwargs, + ) + + @no_type_check + @classmethod + def parse_raw( + cls: Type[T], + b: 'StrBytes', + *, + content_type: str = None, + encoding: str = 'utf8', + proto: 'Protocol' = None, + allow_pickle: bool = False, + ) -> T: + """ + Parse a raw string or bytes into a base doc + :param b: + :param content_type: + :param encoding: the encoding to use when parsing a string, defaults to 'utf8' + :param proto: protocol to use. + :param allow_pickle: allow pickle protocol + :return: a document + """ + return super(BaseDoc, cls).parse_raw( + b, + content_type=content_type, + encoding=encoding, + proto=proto, + allow_pickle=allow_pickle, + ) diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index b2a64e8082b..e50d9ac791d 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -138,7 +138,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compression algorithm to use :return: the binary serialization in bytes """ import pickle diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 99fdbc2bf8e..471e97483ba 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -25,7 +25,8 @@ def update(self, other: T): Updates self with the content of other. Changes are applied to self. Updating one Document with another consists in the following: - setting data properties of the second Document to the first Document - if they are not None + if they are not None: + - Concatenating lists and updating sets - Updating recursively Documents and DocArrays - Updating Dictionaries of the left with the right @@ -38,30 +39,33 @@ def update(self, other: T): so they behave as regular types and the value of `self` is updated with the value of `other` - EXAMPLE USAGE - .. code-block:: python + --- + + ```python + from typing import List, Optional - from docarray import BaseDoc - from docarray.documents import Text + from docarray import BaseDoc - class MyDocument(BaseDoc): - content: str - title: Optional[str] = None - tags_: List + class MyDocument(BaseDoc): + content: str + title: Optional[str] = None + tags_: List - doc1 = MyDocument( - content='Core content of the document', title='Title', tags_=['python', 'AI'] - ) - doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + doc1 = MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) - doc1.update(doc2) - assert doc1.content == 'Core content updated' - assert doc1.title == 'Title' - assert doc1.tags_ == ['python', 'AI', 'docarray'] + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] + 
``` + --- :param other: The Document with which to update the contents of this """ if type(self) != type(other): diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index eedcec827cd..e1f5b33f008 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -2,3 +2,4 @@ ::: docarray.array.doc_list.doc_list.DocList ::: docarray.array.doc_list.io.IOMixinArray +::: docarray.array.doc_list.pushpull.PushPullMixin diff --git a/docs/api_references/base_doc/base_doc.md b/docs/api_references/base_doc/base_doc.md index 0fe2dc80891..abce654ee96 100644 --- a/docs/api_references/base_doc/base_doc.md +++ b/docs/api_references/base_doc/base_doc.md @@ -1,3 +1,6 @@ # BaseDoc ::: docarray.base_doc.doc.BaseDoc +::: docarray.base_doc.mixins.io.IOMixin +::: docarray.base_doc.mixins.update.UpdateMixin + diff --git a/docs/integrations/fastapi.md b/docs/user_guide/sending/api/fastAPI.md similarity index 90% rename from docs/integrations/fastapi.md rename to docs/user_guide/sending/api/fastAPI.md index e55b09fba9e..d35308fefce 100644 --- a/docs/integrations/fastapi.md +++ b/docs/user_guide/sending/api/fastAPI.md @@ -1,9 +1,15 @@ -# Use DocArray with FastAPI +# FastAPI -FastAPI is a high-performance web framework for building APIs with Python. It's designed to be easy to use and supports asynchronous programming. -Since [`DocArray` documents are Pydantic Models (with a twist)](../user_guide/representing/first_step.md) they can be easily integrated with FastAPI, +[FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework for building APIs with Python based on Python type hints. It's designed to be easy to use and supports asynchronous programming. +Since [`DocArray` documents are Pydantic Models (with a twist)](../../representing/first_step.md) they can be easily integrated with FastAPI, and provide a seamless and efficient way to work with multimodal data in FastAPI-powered APIs. +!!! 
note + you need to install FastAPI to follow this section + ``` + pip install fastapi + ``` + First, you should define schemas for your input and/or output Documents: ```python diff --git a/docs/how_to/audio2text.md b/docs/user_guide/sending/api/jina.md similarity index 99% rename from docs/how_to/audio2text.md rename to docs/user_guide/sending/api/jina.md index d2f2507e08f..cbdf50acd2a 100644 --- a/docs/how_to/audio2text.md +++ b/docs/user_guide/sending/api/jina.md @@ -1,3 +1,5 @@ +# Jina + # Create an audio to text app with Jina and DocArray V2 This is how you can build an Audio to Text app using Jina, DocArray and Whisper. diff --git a/docs/user_guide/sending/first_step.md b/docs/user_guide/sending/first_step.md index 1079b9dd75b..6e2d2608943 100644 --- a/docs/user_guide/sending/first_step.md +++ b/docs/user_guide/sending/first_step.md @@ -1 +1,12 @@ -# Sending data +# Intro + +In the representation section we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] +to represent multi-modal data. In this section we will see **how to send these data over the wire**. + + +This section is divided into two: + +- [Serialization](./ser/send_doc.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] +- [Using DocArray with a web framework to build a multimodal API](./api/jina.md) + + diff --git a/docs/user_guide/sending/ser/send_doc.md b/docs/user_guide/sending/ser/send_doc.md new file mode 100644 index 00000000000..dd77557dbba --- /dev/null +++ b/docs/user_guide/sending/ser/send_doc.md @@ -0,0 +1,55 @@ +# BaseDoc + +You need to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] before you can store or send it. + +!!! note + [BaseDoc][docarray.base_doc.doc.BaseDoc] supports serialization to `protobuf` and `json` formats. 
+ +## Serialization to protobuf + +You can use [`to_protobuf`][docarray.base_doc.mixins.io.IOMixin.to_protobuf] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a protobuf message object +and use [`from_protobuf`][docarray.base_doc.mixins.io.IOMixin.from_protobuf] to deserialize it. + +```python +from typing import List +from docarray import BaseDoc + + +class MyDoc(BaseDoc): + text: str + tags: List[str] + + +doc = MyDoc(text='hello world', tags=['hello', 'world']) +proto_message = doc.to_protobuf() +new_doc = MyDoc.from_protobuf(proto_message) +assert doc == new_doc # True +``` + +## Serialization to JSON + +You can use [`json`][docarray.base_doc.doc.BaseDoc.json] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a json string +and use [`parse_raw`][docarray.base_doc.doc.BaseDoc.parse_raw] to deserialize it. + +```python +from typing import List +from docarray import BaseDoc + + +class MyDoc(BaseDoc): + text: str + tags: List[str] + + +doc = MyDoc(text='hello world', tags=['hello', 'world']) +json_str = doc.json() +new_doc = MyDoc.parse_raw(json_str) +assert doc == new_doc # True +``` + +See also: + +* The serializing [DocList](./send_doclist.md) section +* The serializing [DocVec](./send_docvec.md) section + + diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md new file mode 100644 index 00000000000..70b1789ca5f --- /dev/null +++ b/docs/user_guide/sending/ser/send_doclist.md @@ -0,0 +1,165 @@ +# DocList +When sending or storing [`DocList`][docarray.array.doc_list.doc_list.DocList], you need to use serialization. [DocList][docarray.array.doc_list.doc_list.DocList] supports multiple ways to serialize the data. 
+ +## JSON +You can use [`to_json()`][docarray.array.doc_list.io.IOMixinArray.to_json] and [`from_json()`][docarray.array.doc_list.io.IOMixinArray.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +with open('simple-dl.json', 'wb') as f: + json_dl = dl.to_json() + print(json_dl) + f.write(json_dl) + +with open('simple-dl.json', 'r') as f: + dl_load_from_json = DocList[SimpleDoc].from_json(f.read()) + print(dl_load_from_json) +``` + +[to_json()][docarray.array.doc_list.io.IOMixinArray.to_json] returns the binary representation of the json object. [from_json()][docarray.array.doc_list.io.IOMixinArray.from_json] can load from either `str` or `binary` representation of the json object. + +```output +b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03571e899a2887af1ac1b","text":"doc 1"}]' +``` + +## Protobuf +To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.io.IOMixinArray.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.io.IOMixinArray.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +proto_message_dl = dl.to_protobuf() +dl_from_proto = DocList[SimpleDoc].from_protobuf(proto_message_dl) +print(type(proto_message_dl)) +print(dl_from_proto) +``` + +[to_protobuf()][docarray.array.doc_list.io.IOMixinArray.to_protobuf] returns a protobuf object of `docarray_pb2.DocListProto` class. [from_protobuf()][docarray.array.doc_list.io.IOMixinArray.from_protobuf] accepts a protobuf message object to construct a [DocList][docarray.array.doc_list.doc_list.DocList]. 
+ +## Base64 +When transferring over the network, you can choose `Base64` format to serialize the [`DocList`][docarray.array.doc_list.doc_list.DocList]. +Serializing a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. + +To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.io.IOMixinArray.to_base64] and [`from_base64()`][docarray.array.doc_list.io.IOMixinArray.from_base64] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +We support multiple compression methods (namely: `lz4`, `bz2`, `lzma`, `zlib`, `gzip`). + + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +base64_repr_dl = dl.to_base64(compress=None, protocol='pickle') + +dl_from_base64 = DocList[SimpleDoc].from_base64( + base64_repr_dl, compress=None, protocol='pickle' +) +``` + +## Binary +Similar to `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. 
+ +To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.io.IOMixinArray.save_binary] and [`load_binary()`][docarray.array.doc_list.io.IOMixinArray.load_binary] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.save_binary('simple-dl.pickle', compress=None, protocol='pickle') + +dl_from_binary = DocList[SimpleDoc].load_binary( + 'simple-dl.pickle', compress=None, protocol='pickle' +) +``` + +The [DocList][docarray.array.doc_list.doc_list.DocList] is stored in the `simple-dl.pickle` file. + +### Bytes +Under the hood, [save_binary()][docarray.array.doc_list.io.IOMixinArray.save_binary] prepares the file object and calls [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] function to convert the [DocList][docarray.array.doc_list.doc_list.DocList] into a byte object. You can use [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] function directly and use [from_bytes()][docarray.array.doc_list.io.IOMixinArray.from_bytes] to load the [DocList][docarray.array.doc_list.doc_list.DocList] from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. Besides, [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] and [save_binary()][docarray.array.doc_list.io.IOMixinArray.save_binary] support multiple options for `compress` as well. 
+ +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +bytes_dl = dl.to_bytes(protocol='pickle', compress=None) + +dl_from_bytes = DocList[SimpleDoc].from_bytes( + bytes_dl, compress=None, protocol='pickle' +) +``` + + +## CSV +You can use [`from_csv()`][docarray.array.doc_list.io.IOMixinArray.from_csv] and [`to_csv()`][docarray.array.doc_list.io.IOMixinArray.to_csv] to deserialize and serialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.to_csv('simple-dl.csv') +dl_from_csv = DocList[SimpleDoc].from_csv('simple-dl.csv') +print(dl_from_csv) +``` + + +## Pandas.Dataframe +You can use [`from_dataframe()`][docarray.array.doc_list.io.IOMixinArray.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.io.IOMixinArray.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +df = dl.to_dataframe() +dl_from_dataframe = DocList[SimpleDoc].from_dataframe(df) +print(dl_from_dataframe) +``` + +See also: + +* The serializing [BaseDoc](./send_doc.md) section +* The serializing [DocVec](./send_docvec.md) section diff --git a/docs/user_guide/sending/ser/send_docvec.md b/docs/user_guide/sending/ser/send_docvec.md new file mode 100644 index 00000000000..3fbaf759075 --- /dev/null +++ b/docs/user_guide/sending/ser/send_docvec.md @@ -0,0 +1,30 @@ +# DocVec + +When sending or storing [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec], you need to use 
serialization. [DocVec][docarray.array.doc_vec.doc_vec.DocVec] only supports protobuf to serialize the data. +You can use [`to_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.to_protobuf] and [`from_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.from_protobuf] to serialize and deserialize a [DocVec][docarray.array.doc_vec.doc_vec.DocVec]: + +```python +import numpy as np + +from docarray import BaseDoc, DocVec +from docarray.typing import AnyTensor + + +class SimpleVecDoc(BaseDoc): + tensor: AnyTensor + + +dv = DocVec[SimpleVecDoc]([SimpleVecDoc(tensor=np.ones(16)) for _ in range(8)]) + +proto_message_dv = dv.to_protobuf() + +dv_from_proto = DocVec[SimpleVecDoc].from_protobuf(proto_message_dv) +``` + +!!! note + We are planning to add more serialization formats in the future, notably JSON. + +[`to_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.to_protobuf] returns a protobuf object of `docarray_pb2.DocVecProto` class. [`from_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.from_protobuf] accepts a protobuf message object to construct a [DocVec][docarray.array.doc_vec.doc_vec.DocVec]. 
+ +* The serializing [BaseDoc](./send_doc.md) section +* The serializing [DocList](./send_doclist.md) section diff --git a/mkdocs.yml b/mkdocs.yml index cdbe290989f..1cdbdb86bd1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -81,16 +81,22 @@ nav: - Representing data: - user_guide/representing/first_step.md - user_guide/representing/array.md - - user_guide/sending/first_step.md + - Sending: + - user_guide/sending/first_step.md + - Serialization: + - user_guide/sending/ser/send_doc.md + - user_guide/sending/ser/send_doclist.md + - user_guide/sending/ser/send_docvec.md + - Building API: + - user_guide/sending/api/jina.md + - user_guide/sending/api/fastAPI.md + - user_guide/storing/first_step.md - How-to: - how_to/add_doc_index.md - how_to/multimodal_training_and_serving.md - how_to/optimize_performance_with_id_generation.md - - how_to/audio2text.md - - Integrations: - - integrations/fastapi.md - Data Types: - data_types/text/text.md - data_types/image/image.md diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 085022b5a00..ccda4714700 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,6 +4,8 @@ from mktestdocs import grab_code_blocks from mktestdocs.__main__ import _executors, check_raw_string +file_to_skip = ['fastAPI', 'jina'] + def check_raw_file_full(raw, lang="python", keyword_ignore=[]): if lang not in _executors: @@ -43,16 +45,25 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore) -@pytest.mark.parametrize( - 'fpath', - [ - *list(pathlib.Path('docs/user_guide').glob('**/*.md')), - *list(pathlib.Path('docs/data_types').glob('**/*.md')), - ], - ids=str, -) +files_to_check = [ + *list(pathlib.Path('docs/user_guide').glob('**/*.md')), + *list(pathlib.Path('docs/data_types').glob('**/*.md')), +] + +file_to_remove = [] + +for file in files_to_check: + for fn in file_to_skip: + if fn in str(file): + 
file_to_remove.append(file) + +for file in file_to_remove: + files_to_check.remove(file) + + +@pytest.mark.parametrize('fpath', files_to_check, ids=str) def test_files_good(fpath): - check_md_file(fpath=fpath, memory=True) + check_md_file(fpath=fpath, memory=True, keyword_ignore=['pickle']) def test_readme():