From a34d9cc0596261d766fcf83266c6320a889c5fc6 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 12:08:11 +0200 Subject: [PATCH 01/33] docs: add serialization for json Signed-off-by: nan-wang --- docs/user_guide/sending/send_doclist.md | 31 +++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 docs/user_guide/sending/send_doclist.md diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/send_doclist.md new file mode 100644 index 00000000000..c4b340649eb --- /dev/null +++ b/docs/user_guide/sending/send_doclist.md @@ -0,0 +1,31 @@ +# Serialization for `DocList` +When sending or storing `DocList`, you need to use serialization. `DocList` supports multiple ways to serialize the data. + +## json +You can use `to_json()` and `from_json()` to serialize and deserialize a `DocList`. + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +with open('simple-dl.json', 'wb') as f: + json_dl = dl.to_json() + print(json_dl) + f.write(json_dl) + +with open('simple-dl.json', 'r') as f: + dl_load_from_json = DocList[SimpleDoc].from_json(f.read()) + print(dl_load_from_json) +``` + +`to_json()` return the binary representation of the json object. `from_json()` can load from either `str` or `binary` representation of the json object. 
+ +```output +b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03571e899a2887af1ac1b","text":"doc 1"}]' +``` \ No newline at end of file From 2a9de6d2d8016b3ea4b140a005effebbba538ec9 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 12:40:25 +0200 Subject: [PATCH 02/33] docs: add serialization for binary and protobuf Signed-off-by: nan-wang --- docs/user_guide/sending/send_doclist.md | 69 +++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/send_doclist.md index c4b340649eb..374acf0781a 100644 --- a/docs/user_guide/sending/send_doclist.md +++ b/docs/user_guide/sending/send_doclist.md @@ -1,7 +1,7 @@ # Serialization for `DocList` When sending or storing `DocList`, you need to use serialization. `DocList` supports multiple ways to serialize the data. -## json +## JSON You can use `to_json()` and `from_json()` to serialize and deserialize a `DocList`. ```python @@ -24,8 +24,71 @@ with open('simple-dl.json', 'r') as f: print(dl_load_from_json) ``` -`to_json()` return the binary representation of the json object. `from_json()` can load from either `str` or `binary` representation of the json object. +`to_json()` returns the binary representation of the json object. `from_json()` can load from either `str` or `binary` representation of the json object. 
```output b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03571e899a2887af1ac1b","text":"doc 1"}]' -``` \ No newline at end of file +``` + +## Protobuf +When using protobuf, you can use `to_protobuf()` and `from_protobuf()` to serialize and deserialize a `DocList` + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +proto_message_dl = dl.to_protobuf() +dl_from_proto = DocList[SimpleDoc].from_protobuf(proto_message_dl) +print(type(proto_message_dl)) +print(dl_from_proto) +``` + +`to_protobuf()` returns a protobuf object of `docarray_pb2.DocListProto` class. `from_protobuf()` accepts a protobuf message object to construct a `DocList`. + + +## Bytes + + +## Binary +Storing a `DocList` supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. + + +| Compression Methods | Notes | +| --- |-------| +| `lz4` | | +| `bz2` | | +| `lzma` | | +| `zlib` | | +| `gzip` | | + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.save_binary('simple-dl.pickle', compress=None, protocol='pickle') + +dl_from_binary = DocList[SimpleDoc].load_binary('simple-dl.pickle', compress=None, protocol='pickle') +``` + +The `DocList` is stored at `simple-dl.pickle` file. 
+ + +## Base64 + + +## CSV + + +## Pandas.Dataframe \ No newline at end of file From 189787a5f5d847dba274edef72617b01af319123 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 12:56:28 +0200 Subject: [PATCH 03/33] docs: add serialization for base64 and bytes Signed-off-by: nan-wang --- docs/user_guide/sending/send_doclist.md | 44 +++++++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/send_doclist.md index 374acf0781a..dfd43b10905 100644 --- a/docs/user_guide/sending/send_doclist.md +++ b/docs/user_guide/sending/send_doclist.md @@ -51,12 +51,9 @@ print(dl_from_proto) `to_protobuf()` returns a protobuf object of `docarray_pb2.DocListProto` class. `from_protobuf()` accepts a protobuf message object to construct a `DocList`. - -## Bytes - - -## Binary -Storing a `DocList` supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. +## Base64 +When transferring over the network, you can choose `Base64` format to serialize the `DocList`. +Storing a `DocList` in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. | Compression Methods | Notes | | --- |-------| | `lz4` | | | `bz2` | | | `lzma` | | | `zlib` | | | `gzip` | | ```python from docarray import BaseDoc, DocList +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +base64_repr_dl = dl.to_base64(compress=None, protocol='pickle') + +dl_from_base64 = DocList[SimpleDoc].from_base64(base64_repr_dl, compress=None, protocol='pickle') +``` + +## Binary +Similar to `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. 
+ +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.save_binary('simple-dl.pickle', compress=None, protocol='pickle') + +dl_from_binary = DocList[SimpleDoc].load_binary('simple-dl.pickle', compress=None, protocol='pickle') +``` + +The `DocList` is stored at `simple-dl.pickle` file. +### Bytes +Under the hood, `save_binary()` prepares the file object and calls `to_bytes()` function to convert the `DocList` into a byte object. You can use `to_bytes()` function directly and use `from_bytes()` to load the `DocList` from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. Besides, `to_bytes()` and `save_binary()` support multiple options for `compress` as well. +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +bytes_dl = dl.to_bytes(protocol='pickle', compress=None) + +dl_from_bytes = DocList[SimpleDoc].from_bytes(bytes_dl, compress=None, protocol='pickle') +``` ## CSV From f363a1b2b4b011d82ab870bc73a4669ffb56de93 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 13:01:32 +0200 Subject: [PATCH 04/33] docs: add serialization for csv Signed-off-by: nan-wang --- docs/user_guide/sending/send_doclist.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/send_doclist.md index dfd43b10905..936795bf86e 100644 --- a/docs/user_guide/sending/send_doclist.md +++ b/docs/user_guide/sending/send_doclist.md @@ -119,6 +119,22 @@ dl_from_bytes = DocList[SimpleDoc].from_bytes(bytes_dl, compress=None, protocol= ## CSV +You can use `from_csv()` and `to_csv()` to deserialize and serialize the `DocList` from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format. Check more details in the API doc. TODO: Add api doc here. 
+ +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.to_csv('simple-dl.csv') +dl_from_csv = DocList[SimpleDoc].from_csv('simple-dl.csv') +print(dl_from_csv) +``` ## Pandas.Dataframe \ No newline at end of file From aedb7d8a54871b49d2e374970190a0a00462ab26 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 13:04:28 +0200 Subject: [PATCH 05/33] docs: add serialization for dataframe Signed-off-by: nan-wang --- docs/user_guide/sending/send_doclist.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/send_doclist.md index 936795bf86e..92a48f1c803 100644 --- a/docs/user_guide/sending/send_doclist.md +++ b/docs/user_guide/sending/send_doclist.md @@ -137,4 +137,20 @@ print(dl_from_csv) ``` -## Pandas.Dataframe \ No newline at end of file +## Pandas.Dataframe +You can use `from_pandas()` and `to_pandas()` to load/save the `DocList` from/to a pandas DataFrame. 
+ +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +df = dl.to_pandas() +dl_from_dataframe = DocList[SimpleDoc].from_pandas(df) +print(dl_from_dataframe) +``` \ No newline at end of file From e223aba1ec3553ba775f67c8cea6c6c51fa4c057 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 11:20:08 +0200 Subject: [PATCH 06/33] fix: add doctring to documentaion basedoc Signed-off-by: samsja --- docarray/base_doc/doc.py | 118 +++++++++++++++++++++++++++++ docarray/base_doc/mixins/update.py | 34 +++++---- 2 files changed, 136 insertions(+), 16 deletions(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index a5c42a82ee4..7e100bceecb 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from docarray.array.doc_vec.column_storage import ColumnStorageView + from docarray.proto import DocProto _console: Console = Console() @@ -141,3 +142,120 @@ def _docarray_to_json_compatible(self) -> Dict: :return: A dictionary of the BaseDoc object """ return self.dict() + + ######################################################################################################################################################## + ### this section is just for documentation purposes will be removed later once https://github.com/mkdocstrings/griffe/issues/138 is fixed ############## + ######################################################################################################################################################## + + def to_bytes( + self, protocol: str = 'protobuf', compress: Optional[str] = None + ) -> bytes: + """Serialize itself into bytes. + + For more Pythonic code, please use ``bytes(...)``. + + :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' + :param compress: compress algorithm to use + :return: the binary serialization in bytes + """ + return super().to_bytes(protocol, compress) + + @classmethod + def from_bytes( + cls: Type[T], + data: bytes, + protocol: str = 'protobuf', + compress: Optional[str] = None, + ) -> T: + """Build Document object from binary bytes + + :param data: binary bytes + :param protocol: protocol to use. It can be 'pickle' or 'protobuf' + :param compress: compress method to use + :return: a Document object + """ + return super(BaseDoc, cls).from_bytes(data, protocol, compress) + + def to_base64( + self, protocol: str = 'protobuf', compress: Optional[str] = None + ) -> str: + """Serialize a Document object into as base64 string + + :param protocol: protocol to use. It can be 'pickle' or 'protobuf' + :param compress: compress method to use + :return: a base64 encoded string + """ + return super().to_base64(protocol, compress) + + @classmethod + def from_base64( + cls: Type[T], + data: str, + protocol: str = 'pickle', + compress: Optional[str] = None, + ) -> T: + """Build Document object from binary bytes + + :param data: a base64 encoded string + :param protocol: protocol to use. It can be 'pickle' or 'protobuf' + :param compress: compress method to use + :return: a Document object + """ + return super(BaseDoc, cls).from_base64(data, protocol, compress) + + @classmethod + def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T: + """create a Document from a protobuf message + + :param pb_msg: the proto message of the Document + :return: a Document initialize with the proto data + """ + return super(BaseDoc, cls).from_protobuf(pb_msg) + + def update(self, other: T): + """ + Updates self with the content of other. Changes are applied to self. 
+ Updating one Document with another consists in the following: + - setting data properties of the second Document to the first Document + if they are not None + - Concatenating lists and updating sets + - Updating recursively Documents and DocArrays + - Updating Dictionaries of the left with the right + + It behaves as an update operation for Dictionaries, except that since + it is applied to a static schema type, the presence of the field is + given by the field not having a None value and that DocArrays, + lists and sets are concatenated. It is worth mentioning that Tuples + are not merged together since they are meant to be inmutable, + so they behave as regular types and the value of `self` is updated + with the value of `other` + + + --- + + ```python + from docarray import BaseDoc + from docarray.documents import Text + + + class MyDocument(BaseDoc): + content: str + title: Optional[str] = None + tags_: List + + + doc1 = MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] + ``` + + --- + :param other: The Document with which to update the contents of this + """ + super().update(other) diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 99fdbc2bf8e..5a21738a7d4 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -38,30 +38,32 @@ def update(self, other: T): so they behave as regular types and the value of `self` is updated with the value of `other` - EXAMPLE USAGE - .. 
code-block:: python + --- - from docarray import BaseDoc - from docarray.documents import Text + ```python + from docarray import BaseDoc + from docarray.documents import Text - class MyDocument(BaseDoc): - content: str - title: Optional[str] = None - tags_: List + class MyDocument(BaseDoc): + content: str + title: Optional[str] = None + tags_: List - doc1 = MyDocument( - content='Core content of the document', title='Title', tags_=['python', 'AI'] - ) - doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + doc1 = MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) - doc1.update(doc2) - assert doc1.content == 'Core content updated' - assert doc1.title == 'Title' - assert doc1.tags_ == ['python', 'AI', 'docarray'] + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] + ``` + --- :param other: The Document with which to update the contents of this """ if type(self) != type(other): From 7491246becf3fa53b6de01a2a6e091a1c1804144 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 11:21:43 +0200 Subject: [PATCH 07/33] fix: fix mypy Signed-off-by: samsja --- docarray/base_doc/doc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 7e100bceecb..ccb5b65b99e 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -18,6 +18,7 @@ _console: Console = Console() T = TypeVar('T', bound='BaseDoc') +T_update = TypeVar('T_update', bound='UpdateMixin') class BaseDoc(BaseModel, IOMixin, UpdateMixin, BaseNode): @@ -212,7 +213,7 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T: """ return super(BaseDoc, cls).from_protobuf(pb_msg) - def update(self, other: T): + def update(self, other: T_update): """ Updates self with the content of other. 
Changes are applied to self. Updating one Document with another consists in the following: From 5a40e79c70d72bb28a5eb55da58c47696e3d3c8b Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 11:39:04 +0200 Subject: [PATCH 08/33] fix: add docstring doc list Signed-off-by: samsja --- docarray/array/doc_list/doc_list.py | 290 +++++++++++++++++++++++++++- 1 file changed, 289 insertions(+), 1 deletion(-) diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 89364ff4842..257f294a5e7 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -1,10 +1,15 @@ +import csv import io +import pathlib from functools import wraps from typing import ( TYPE_CHECKING, Any, + BinaryIO, Callable, + Generator, Iterable, + Iterator, List, MutableSequence, Optional, @@ -18,7 +23,7 @@ from typing_inspect import is_union_type from docarray.array.any_array import AnyDocArray -from docarray.array.doc_list.io import IOMixinArray +from docarray.array.doc_list.io import IOMixinArray, _LazyRequestReader from docarray.array.doc_list.pushpull import PushPullMixin from docarray.array.doc_list.sequence_indexing_mixin import ( IndexingSequenceMixin, @@ -28,6 +33,7 @@ from docarray.typing import NdArray if TYPE_CHECKING: + import pandas as pd from pydantic import BaseConfig from pydantic.fields import ModelField @@ -305,3 +311,285 @@ def __getitem__(self: T, item: IndexIterType) -> T: def __getitem__(self, item): return super().__getitem__(item) + + ######################################################################################################################################################## + ### this section is just for documentation purposes will be removed later once https://github.com/mkdocstrings/griffe/issues/138 is fixed ############## + ######################################################################################################################################################## + + def 
to_protobuf(self) -> 'DocListProto': + """Convert DocList into a Protobuf message""" + return super(DocList, self).to_protobuf() + + @classmethod + def from_bytes( + cls: Type[T], + data: bytes, + protocol: str = 'protobuf-array', + compress: Optional[str] = None, + show_progress: bool = False, + ) -> T: + """Deserialize bytes into a DocList. + + :param data: Bytes from which to deserialize + :param protocol: protocol that was used to serialize + :param compress: compress algorithm that was used to serialize + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :return: the deserialized DocList + """ + return super(DocList, cls).from_bytes( + data, protocol=protocol, compress=compress, show_progress=show_progress + ) + + def to_binary_stream( + self, + protocol: str = 'protobuf', + compress: Optional[str] = None, + show_progress: bool = False, + ) -> Iterator[bytes]: + return super().to_binary_stream( + protocol=protocol, compress=compress, show_progress=show_progress + ) + + def to_bytes( + self, + protocol: str = 'protobuf-array', + compress: Optional[str] = None, + file_ctx: Optional[BinaryIO] = None, + show_progress: bool = False, + ) -> Optional[bytes]: + """Serialize itself into bytes. + + For more Pythonic code, please use ``bytes(...)``. + + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use + :param file_ctx: File or filename or serialized bytes where the data is stored. 
+ :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :return: the binary serialization in bytes or None if file_ctx is passed where to store + """ + return super().to_bytes( + protocol=protocol, + compress=compress, + file_ctx=file_ctx, + show_progress=show_progress, + ) + + @classmethod + def from_base64( + cls: Type[T], + data: str, + protocol: str = 'protobuf-array', + compress: Optional[str] = None, + show_progress: bool = False, + ) -> T: + """Deserialize base64 strings into a DocList. + + :param data: Base64 string to deserialize + :param protocol: protocol that was used to serialize + :param compress: compress algorithm that was used to serialize + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :return: the deserialized DocList + """ + return super(DocList, cls).from_base64( + data, protocol=protocol, compress=compress, show_progress=show_progress + ) + + def to_base64( + self, + protocol: str = 'protobuf-array', + compress: Optional[str] = None, + show_progress: bool = False, + ) -> str: + """Serialize itself into base64 encoded string. + + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :return: the binary serialization in bytes or None if file_ctx is passed where to store + """ + return super().to_base64( + protocol=protocol, compress=compress, show_progress=show_progress + ) + + @classmethod + def from_json( + cls: Type[T], + file: Union[str, bytes, bytearray], + ) -> T: + """Deserialize JSON strings or bytes into a DocList. + + :param file: JSON object from where to deserialize a DocList + :return: the deserialized DocList + """ + return super(DocList, cls).from_json(file) + + def to_json(self) -> bytes: + """Convert the object into JSON bytes. 
Can be loaded via :meth:`.from_json`. + :return: JSON serialization of DocList + """ + return super().to_json() + + @classmethod + def from_csv( + cls, + file_path: str, + encoding: str = 'utf-8', + dialect: Union[str, csv.Dialect] = 'excel', + ) -> 'DocList': + """ + Load a DocList from a csv file following the schema defined in the + :attr:`~docarray.DocList.doc_type` attribute. + Every row of the csv file will be mapped to one document in the doc_list. + The column names (defined in the first row) have to match the field names + of the Document type. + For nested fields use "__"-separated access paths, such as 'image__url'. + + List-like fields (including field of type DocList) are not supported. + + :param file_path: path to csv file to load DocList from. + :param encoding: encoding used to read the csv file. Defaults to 'utf-8'. + :param dialect: defines separator and how to handle whitespaces etc. + Can be a csv.Dialect instance or one string of: + 'excel' (for comma seperated values), + 'excel-tab' (for tab separated values), + 'unix' (for csv file generated on UNIX systems). + :return: DocList + """ + return super(DocList, cls).from_csv( + file_path, encoding=encoding, dialect=dialect + ) + + def to_csv( + self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' + ) -> None: + """ + Save a DocList to a csv file. + The field names will be stored in the first row. Each row corresponds to the + information of one Document. + Columns for nested fields will be named after the "__"-seperated access paths, + such as `"image__url"` for `image.url`. + + :param file_path: path to a csv file. + :param dialect: defines separator and how to handle whitespaces etc. + Can be a csv.Dialect instance or one string of: + 'excel' (for comma seperated values), + 'excel-tab' (for tab separated values), + 'unix' (for csv file generated on UNIX systems). 
+ """ + return super().to_csv(file_path, dialect=dialect) + + @classmethod + def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList': + """ + Load a DocList from a `pandas.DataFrame` following the schema + defined in the :attr:`~docarray.DocList.doc_type` attribute. + Every row of the dataframe will be mapped to one Document in the doc_list. + The column names of the dataframe have to match the field names of the + Document type. + For nested fields use "__"-separated access paths as column names, + such as 'image__url'. + + List-like fields (including field of type DocList) are not supported. + + EXAMPLE USAGE: + + .. code-block:: python + + import pandas as pd + + from docarray import BaseDoc, DocList + + + class Person(BaseDoc): + name: str + follower: int + + + df = pd.DataFrame( + data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] + ) + + docs = DocList[Person].from_dataframe(df) + + assert docs.name == ['Maria', 'Jake'] + assert docs.follower == [12345, 54321] + + + :param df: pandas.DataFrame to extract Document's information from + :return: DocList where each Document contains the information of one + corresponding row of the `pandas.DataFrame`. + """ + return super(DocList, cls).from_dataframe(df) + + def to_dataframe(self) -> 'pd.DataFrame': + """ + Save a DocList to a `pandas.DataFrame`. + The field names will be stored as column names. Each row of the dataframe corresponds + to the information of one Document. + Columns for nested fields will be named after the "__"-seperated access paths, + such as `"image__url"` for `image.url`. 
+ + :return: pandas.DataFrame + """ + return super().to_dataframe() + + @classmethod + def load_binary( + cls: Type[T], + file: Union[str, bytes, pathlib.Path, io.BufferedReader, _LazyRequestReader], + protocol: str = 'protobuf-array', + compress: Optional[str] = None, + show_progress: bool = False, + streaming: bool = False, + ) -> Union[T, Generator['T_doc', None, None]]: + """Load doc_list elements from a compressed binary file. + + :param file: File or filename or serialized bytes where the data is stored. + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :param streaming: if `True` returns a generator over `Document` objects. + In case protocol is pickle the `Documents` are streamed from disk to save memory usage + :return: a DocList object + + .. note:: + If `file` is `str` it can specify `protocol` and `compress` as file extensions. + This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a + string interpolation of the respective `protocol` and `compress` methods. + For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf` + and `compress=lz4`. + """ + return super().load_binary( + file, protocol=protocol, compress=compress, show_progress=show_progress + ) + + def save_binary( + self, + file: Union[str, pathlib.Path], + protocol: str = 'protobuf-array', + compress: Optional[str] = None, + show_progress: bool = False, + ) -> None: + """Save DocList into a binary file. + + It will use the protocol to pick how to save the DocList. + If used 'picke-doc_list` and `protobuf-array` the DocList will be stored + and compressed at complete level using `pickle` or `protobuf`. 
+ When using `protobuf` or `pickle` as protocol each Document in DocList + will be stored individually and this would make it available for streaming. + + :param file: File or filename to which the data is saved. + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + + .. note:: + If `file` is `str` it can specify `protocol` and `compress` as file extensions. + This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a + string interpolation of the respective `protocol` and `compress` methods. + For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` + and `compress=lz4`. + """ + return super().save_binary( + file, protocol=protocol, compress=compress, show_progress=show_progress + ) From 4e53699e6ebbfabd51fa797a1d6c6f0081b72cbe Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 11:52:51 +0200 Subject: [PATCH 09/33] fix: dic doc array docstring Signed-off-by: samsja --- docarray/array/any_array.py | 99 ++++++++++++++++------------- docarray/array/doc_list/doc_list.py | 28 ++++---- docarray/array/doc_list/io.py | 28 ++++---- 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index d156da9ea8c..2e1e6db091c 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -162,77 +162,88 @@ def traverse_flat( names, concatenated and "__"-separated. It describes the path from the first level to an arbitrary one, e.g. 'content__image__url'. - :param access_path: a string that represents the access path ("__"-separated). - :return: list of the accessed objects, flattened if nested. - EXAMPLE USAGE - .. 
code-block:: python - from docarray import BaseDoc, DocList, Text + --- + ```python + from docarray import BaseDoc, DocList, Text - class Author(BaseDoc): - name: str + class Author(BaseDoc): + name: str - class Book(BaseDoc): - author: Author - content: Text + class Book(BaseDoc): + author: Author + content: Text - docs = DocList[Book]( - Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) - for i in range(10) # noqa: E501 - ) - books = docs.traverse_flat(access_path='content') # list of 10 Text objs + docs = DocList[Book]( + Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) + for i in range(10) # noqa: E501 + ) - authors = docs.traverse_flat(access_path='author__name') # list of 10 strings + books = docs.traverse_flat(access_path='content') # list of 10 Text objs + + authors = docs.traverse_flat(access_path='author__name') # list of 10 strings + ``` + + --- If the resulting list is a nested list, it will be flattened: - EXAMPLE USAGE - .. code-block:: python - from docarray import BaseDoc, DocList + --- + ```python + from docarray import BaseDoc, DocList - class Chapter(BaseDoc): - content: str + class Chapter(BaseDoc): + content: str - class Book(BaseDoc): - chapters: DocList[Chapter] + class Book(BaseDoc): + chapters: DocList[Chapter] - docs = DocList[Book]( - Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) - for _ in range(10) - ) - chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings + docs = DocList[Book]( + Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) + for _ in range(10) + ) + chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings + ``` + + --- If your DocList is in doc_vec mode, and you want to access a field of type AnyTensor, the doc_vec tensor will be returned instead of a list: - EXAMPLE USAGE - .. 
code-block:: python - class Image(BaseDoc): - tensor: TorchTensor[3, 224, 224] + --- + ```python + class Image(BaseDoc): + tensor: TorchTensor[3, 224, 224] - batch = DocList[Image]( - [ - Image( - tensor=torch.zeros(3, 224, 224), - ) - for _ in range(2) - ] - ) - batch_stacked = batch.stack() - tensors = batch_stacked.traverse_flat( - access_path='tensor' - ) # tensor of shape (2, 3, 224, 224) + batch = DocList[Image]( + [ + Image( + tensor=torch.zeros(3, 224, 224), + ) + for _ in range(2) + ] + ) + + batch_stacked = batch.stack() + tensors = batch_stacked.traverse_flat( + access_path='tensor' + ) # tensor of shape (2, 3, 224, 224) + ``` + --- + + :param access_path: a string that represents the access path ("__"-separated). + :return: list of the accessed objects, flattened if nested. """ ... diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 257f294a5e7..91ee5443872 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -491,29 +491,31 @@ def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList': List-like fields (including field of type DocList) are not supported. - EXAMPLE USAGE: - .. 
code-block:: python + --- - import pandas as pd + ```python + import pandas as pd - from docarray import BaseDoc, DocList + from docarray import BaseDoc, DocList - class Person(BaseDoc): - name: str - follower: int + class Person(BaseDoc): + name: str + follower: int - df = pd.DataFrame( - data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] - ) + df = pd.DataFrame( + data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] + ) - docs = DocList[Person].from_dataframe(df) + docs = DocList[Person].from_dataframe(df) - assert docs.name == ['Maria', 'Jake'] - assert docs.follower == [12345, 54321] + assert docs.name == ['Maria', 'Jake'] + assert docs.follower == [12345, 54321] + ``` + --- :param df: pandas.DataFrame to extract Document's information from :return: DocList where each Document contains the information of one diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index fed12363697..91623df4e5d 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -451,28 +451,30 @@ def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList': List-like fields (including field of type DocList) are not supported. - EXAMPLE USAGE: + --- - .. 
code-block:: python + ```python + import pandas as pd - import pandas as pd + from docarray import BaseDoc, DocList - from docarray import BaseDoc, DocList + class Person(BaseDoc): + name: str + follower: int - class Person(BaseDoc): - name: str - follower: int + df = pd.DataFrame( + data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] + ) - df = pd.DataFrame( - data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] - ) + docs = DocList[Person].from_dataframe(df) - docs = DocList[Person].from_dataframe(df) + assert docs.name == ['Maria', 'Jake'] + assert docs.follower == [12345, 54321] + ``` - assert docs.name == ['Maria', 'Jake'] - assert docs.follower == [12345, 54321] + --- :param df: pandas.DataFrame to extract Document's information from From 39c1df974e30f0a83889c5478b6e27f17952d96a Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 13:02:22 +0200 Subject: [PATCH 10/33] fix: fix page for doc list serilizaiton Signed-off-by: samsja --- docarray/array/doc_list/doc_list.py | 22 ++++++------ docarray/array/doc_list/io.py | 16 ++++----- docs/user_guide/sending/send_doclist.md | 48 +++++++++++++------------ mkdocs.yml | 5 ++- 4 files changed, 50 insertions(+), 41 deletions(-) diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 91ee5443872..d7b16d3031c 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -332,7 +332,7 @@ def from_bytes( :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized DocList """ @@ -362,7 +362,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. 
:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param file_ctx: File or filename or serialized bytes where the data is stored. :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store @@ -403,7 +403,7 @@ def to_base64( """Serialize itself into base64 encoded string. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store """ @@ -548,7 +548,7 @@ def load_binary( :param file: File or filename or serialized bytes where the data is stored. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between 'lz4', 'gzip', 'bz2', 'zstd', 'lzma' :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage @@ -580,17 +580,19 @@ def save_binary( When using `protobuf` or `pickle` as protocol each Document in DocList will be stored individually and this would make it available for streaming. - :param file: File or filename to which the data is saved. - :param protocol: protocol to use. 
It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - - .. note:: + !! note If `file` is `str` it can specify `protocol` and `compress` as file extensions. This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` and `compress=lz4`. + + :param file: File or filename to which the data is saved. + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use between : `lz4`, `bz2`, `lzma`, `zlib`, `gzip` + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + + """ return super().save_binary( file, protocol=protocol, compress=compress, show_progress=show_progress diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 91623df4e5d..265e648449d 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -141,7 +141,7 @@ def from_bytes( :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized DocList """ @@ -247,7 +247,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. 
It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between : `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param file_ctx: File or filename or serialized bytes where the data is stored. :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store @@ -277,7 +277,7 @@ def from_base64( :param data: Base64 string to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized DocList """ @@ -297,7 +297,7 @@ def to_base64( """Serialize itself into base64 encoded string. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store """ @@ -562,7 +562,7 @@ def _load_binary_all( ): """Read a `DocList` object from a binary file :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: a `DocList` """ @@ -642,7 +642,7 @@ def _load_binary_stream( """Yield `Document` objects from a binary file :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: a generator of `Document` objects """ @@ -700,7 +700,7 @@ def load_binary( :param file: File or filename or serialized bytes where the data is stored. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage @@ -761,7 +761,7 @@ def save_binary( :param file: File or filename to which the data is saved. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` .. note:: diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/send_doclist.md index 92a48f1c803..25ecce1cbff 100644 --- a/docs/user_guide/sending/send_doclist.md +++ b/docs/user_guide/sending/send_doclist.md @@ -1,8 +1,8 @@ -# Serialization for `DocList` -When sending or storing `DocList`, you need to use serialization. `DocList` supports multiple ways to serialize the data. +# Serialization for DocList +When sending or storing [`DocList`][docarray.array.doc_list.doc_list.DocList], you need to use serialization. [DocList][docarray.array.doc_list.doc_list.DocList] supports multiple ways to serialize the data. 
## JSON -You can use `to_json()` and `from_json()` to serialize and deserialize a `DocList`. +You can use [`to_json()`][docarray.array.doc_list.doc_list.DocList.to_json] and [`from_json()`][docarray.array.doc_list.doc_list.DocList.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]. ```python from docarray import BaseDoc, DocList @@ -24,14 +24,14 @@ with open('simple-dl.json', 'r') as f: print(dl_load_from_json) ``` -`to_json()` returns the binary representation of the json object. `from_json()` can load from either `str` or `binary` representation of the json object. +[to_json()][docarray.array.doc_list.doc_list.DocList.to_json] returns the binary representation of the json object. [from_json()][docarray.array.doc_list.doc_list.DocList.from_json] can load from either `str` or `binary` representation of the json object. ```output b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03571e899a2887af1ac1b","text":"doc 1"}]' ``` ## Protobuf -When using protobuf, you can use `to_protobuf()` and `from_protobuf()` to serialize and deserialize a `DocList` +To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.doc_list.DocList.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]. ```python from docarray import BaseDoc, DocList @@ -49,20 +49,16 @@ print(type(proto_message_dl)) print(dl_from_proto) ``` -`to_protobuf()` returns a protobuf object of `docarray_pb2.DocListProto` class. `from_protobuf()` accepts a protobuf message object to construct a `DocList`. +[to_protobuf()][docarray.array.doc_list.doc_list.DocList.to_protobuf] returns a protobuf object of `docarray_pb2.DocListProto` class. 
[from_protobuf()][docarray.array.doc_list.doc_list.DocList.from_protobuf] accepts a protobuf message object to construct a [DocList][docarray.array.doc_list.doc_list.DocList]. ## Base64 -When transferring over the network, you can choose `Base64` format to serialize the `DocList`. -Storing a `DocList` in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. +When transferring over the network, you can choose `Base64` format to serialize the [`DocList`][docarray.array.doc_list.doc_list.DocList]. +Serializing a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. +To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`from_base64()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]. + +We support multiple compression methods. (namely : `lz4`, `bz2`, `lzma`, `zlib`, `gzip`) -| Compression Methods | Notes | -| --- |-------| -| `lz4` | | -| `bz2` | | -| `lzma` | | -| `zlib` | | -| `gzip` | | ```python from docarray import BaseDoc, DocList @@ -76,12 +72,16 @@ dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) base64_repr_dl = dl.to_base64(compress=None, protocol='pickle') -dl_from_base64 = DocList[SimpleDoc].to_base64(base64_repr_dl , compress=None, protocol='pickle') +dl_from_base64 = DocList[SimpleDoc].from_base64( + base64_repr_dl, compress=None, protocol='pickle' +) ``` ## Binary Similar as in `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. 
+To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.doc_list.DocList.save_binary] and [`load_binary()`][docarray.array.doc_list.doc_list.DocList.load_binary] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]. + ```python from docarray import BaseDoc, DocList @@ -94,13 +94,15 @@ dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) dl.save_binary('simple-dl.pickle', compress=None, protocol='pickle') -dl_from_binary = DocList[SimpleDoc].load_binary('simple-dl.pickle', compress=None, protocol='pickle') +dl_from_binary = DocList[SimpleDoc].load_binary( + 'simple-dl.pickle', compress=None, protocol='pickle' +) ``` -The `DocList` is stored at `simple-dl.pickle` file. +The [DocList][docarray.array.doc_list.doc_list.DocList] is stored in the `simple-dl.pickle` file. ### Bytes -Under the hood, `save_binary()` prepares the file object and calls `to_bytes()` function to convert the `DocList` into a byte object. You can use `to_bytes()` function directly and use `from_bytes()` to load the `DocList` from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. Besides, `to_bytes()` and `save_bytes()` support multiple options for `compress` as well. +Under the hood, [save_binary()][docarray.array.doc_list.doc_list.DocList.save_binary] prepares the file object and calls [to_bytes()][docarray.array.doc_list.doc_list.DocList.to_bytes] function to convert the [DocList][docarray.array.doc_list.doc_list.DocList] into a byte object. You can use [to_bytes()][docarray.array.doc_list.doc_list.DocList.to_bytes] function directly and use [from_bytes()][docarray.array.doc_list.doc_list.DocList.from_bytes] to load the [DocList][docarray.array.doc_list.doc_list.DocList] from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. 
Besides, [to_bytes()][docarray.array.doc_list.doc_list.DocList.to_bytes] and [save_binary()][docarray.array.doc_list.doc_list.DocList.save_binary] support multiple options for `compress` as well. ```python from docarray import BaseDoc, DocList @@ -114,12 +116,14 @@ dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) bytes_dl = dl.to_bytes(protocol='pickle', compress=None) -dl_from_bytes = DocList[SimpleDoc].from_bytes(bytes_dl, compress=None, protocol='pickle') +dl_from_bytes = DocList[SimpleDoc].from_bytes( + bytes_dl, compress=None, protocol='pickle' +) ``` ## CSV -You can use `from_csv()` and `to_csv()` to de-/serializae and deserialize the `DocList` from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format. Check more details in the API doc. TODO: Add api doc here. +You can use [`from_csv()`][docarray.array.doc_list.doc_list.DocList.from_csv] and [`to_csv()`][docarray.array.doc_list.doc_list.DocList.to_csv] to serialize and deserialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format. ```python from docarray import BaseDoc, DocList @@ -138,7 +142,7 @@ print(dl_from_csv) ## Pandas.Dataframe -You can use `from_pandas()` and `to_pandas()` to load/save the `DocList` from/to a pandas DataFrame. +You can use [`from_dataframe()`][docarray.array.doc_list.doc_list.DocList.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.doc_list.DocList.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame. 
```python from docarray import BaseDoc, DocList diff --git a/mkdocs.yml b/mkdocs.yml index f4441995378..1fa2f413d4e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,7 +77,10 @@ nav: - Representing: - user_guide/representing/first_step.md - user_guide/representing/array.md - - user_guide/sending/first_step.md + - Sending: + - user_guide/sending/first_step.md + - user_guide/sending/send_doclist.md + - user_guide/storing/first_step.md - How-to: From 5de2719e94f5208dcd8188459aa9b8dbd46e6958 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 13:10:40 +0200 Subject: [PATCH 11/33] fix: fix docstring Signed-off-by: samsja --- docarray/array/doc_list/doc_list.py | 40 +++++++++++++++++++---------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index d7b16d3031c..b0a656a2195 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -101,27 +101,41 @@ class Image(BaseDoc): fields at the DocList level (for example `docs.tensor` or `docs.url`). You can also set fields, with `docs.tensor = np.random.random([10, 100])`: - print(docs.url) - # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] - import numpy as np + --- - docs.tensor = np.random.random([10, 100]) - print(docs.tensor) - # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, - # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] + ```python + print(docs.url) + # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] + import numpy as np + + docs.tensor = np.random.random([10, 100]) + print(docs.tensor) + # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, + # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] 
+ ``` + --- You can index into a DocList like a numpy doc_list or torch tensor: + --- - docs[0] # index by position - docs[0:5:2] # index by slice - docs[[0, 2, 3]] # index by list of indices - docs[True, False, True, True, ...] # index by boolean mask + ```python + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] # index by boolean mask + ``` + --- You can delete items from a DocList like a Python List + --- + + ```python + del docs[0] # remove first element from DocList + del docs[0:5] # remove elements for 0 to 5 from DocList + ``` - del docs[0] # remove first element from DocList - del docs[0:5] # remove elements for 0 to 5 from DocList + --- :param docs: iterable of Document From e9df25b4ab2dbf49207b51712884d245eaccb8d2 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 13:24:06 +0200 Subject: [PATCH 12/33] feat: add docvec Signed-off-by: samsja --- docs/user_guide/sending/send_docvec.md | 24 ++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 25 insertions(+) create mode 100644 docs/user_guide/sending/send_docvec.md diff --git a/docs/user_guide/sending/send_docvec.md b/docs/user_guide/sending/send_docvec.md new file mode 100644 index 00000000000..d400caa844c --- /dev/null +++ b/docs/user_guide/sending/send_docvec.md @@ -0,0 +1,24 @@ +# Serialization of DocVec + +When sending or storing [`DocVec`][docarray.array.doc_list.doc_list.DocVec], you need to use serialization. [DocVec][docarray.array.doc_list.doc_list.DocVec] only supports protobuf to serialize the data. 
+You can use [`to_protobuf`][docarray.array.doc_list.doc_list.DocVec.to_protobuf] and [`from_protobuf`][docarray.array.doc_list.doc_list.DocVec.from_protobuf] to serialize and deserialize a [DocVec][docarray.array.doc_list.doc_list.DocVec] + +```python +import numpy as np + +from docarray import BaseDoc, DocVec +from docarray.typing import AnyTensor + + +class SimpleVecDoc(BaseDoc): + tensor: AnyTensor + + +dv = DocVec[SimpleVecDoc]([SimpleVecDoc(tensor=np.ones(16)) for _ in range(8)]) + +proto_message_dv = dv.to_protobuf() + +dv_from_proto = DocVec[SimpleVecDoc].from_protobuf(proto_message_dv) +``` + +[`to_protobuf`][docarray.array.doc_list.doc_list.DocVec.to_protobuf] returns a protobuf object of `docarray_pb2.DocVecProto` class. [`from_protobuf`][docarray.array.doc_list.doc_list.DocVec.from_protobuf] accepts a protobuf message object to construct a [DocVec][docarray.array.doc_list.doc_list.DocVec]. diff --git a/mkdocs.yml b/mkdocs.yml index 1fa2f413d4e..e02e8f72056 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -80,6 +80,7 @@ nav: - Sending: - user_guide/sending/first_step.md - user_guide/sending/send_doclist.md + - user_guide/sending/send_docvec.md - user_guide/storing/first_step.md From 28534744e5ff7258a722db484fadc1980ff490d3 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 13:34:42 +0200 Subject: [PATCH 13/33] docs: add send doc section Signed-off-by: samsja --- docs/user_guide/sending/send_doc.md | 48 +++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docs/user_guide/sending/send_doc.md diff --git a/docs/user_guide/sending/send_doc.md b/docs/user_guide/sending/send_doc.md new file mode 100644 index 00000000000..5b7e7fff91d --- /dev/null +++ b/docs/user_guide/sending/send_doc.md @@ -0,0 +1,48 @@ +# Serialization of BaseDoc + +In order to send or store [BaseDoc][docarray.base_doc.doc.BaseDoc] you need to serialize them first. + +!! 
note + [BaseDoc][docarray.base_doc.doc.BaseDoc] supports serialization to `protobuf` and `json` formats. + +## Serialization to protobuf + +You can use [`to_protobuf`][docarray.base_doc.doc.BaseDoc.to_protobuf] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a protobuf message object +and use [`from_protobuf`][docarray.base_doc.doc.BaseDoc.from_protobuf] to deserialize it. + +```python +from typing import List +from docarray import BaseDoc + + +class MyDoc(BaseDoc): + text: str + tags: List[str] + + +doc = MyDoc(text='hello world', tags=['hello', 'world']) +proto_message = doc.to_protobuf() +new_doc = MyDoc.from_protobuf(proto_message) +assert doc == new_doc # True +``` + +## Serialization to json + +You can use [`json`][docarray.base_doc.doc.BaseDoc.json] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a json string +and use [`parse_raw`][docarray.base_doc.doc.BaseDoc.parse_raw] to deserialize it. + +```python +from typing import List +from docarray import BaseDoc + + +class MyDoc(BaseDoc): + text: str + tags: List[str] + + +doc = MyDoc(text='hello world', tags=['hello', 'world']) +json_str = doc.json() +new_doc = MyDoc.parse_raw(json_str) +assert doc == new_doc # True +``` \ No newline at end of file From a6910f16c49cfe6d930c94a852b1e72645451f74 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 13:53:28 +0200 Subject: [PATCH 14/33] docs: fix docstring Signed-off-by: samsja --- docarray/base_doc/doc.py | 82 +++++++++++++++++++++++++++++++++++++++- mkdocs.yml | 1 + 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index ccb5b65b99e..dbc1f155f58 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -1,5 +1,15 @@ import os -from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Optional, + Type, + TypeVar, + Union, + no_type_check, +) import orjson from pydantic import 
BaseModel, Field @@ -12,6 +22,10 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: + from pydantic import Protocol + from pydantic.types import StrBytes + from pydantic.typing import AbstractSetIntStr, MappingIntStrAny + from docarray.array.doc_vec.column_storage import ColumnStorageView from docarray.proto import DocProto @@ -260,3 +274,69 @@ class MyDocument(BaseDoc): :param other: The Document with which to update the contents of this """ super().update(other) + + def to_protobuf(self) -> 'DocProto': + """Convert Document into a Protobuf message. + + :return: the protobuf message + """ + super().to_protobuf() + + def json( + self, + *, + include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None, + exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None, + by_alias: bool = False, + skip_defaults: Optional[bool] = None, + exclude_unset: bool = False, + exclude_defaults: bool = False, + exclude_none: bool = False, + encoder: Optional[Callable[[Any], Any]] = None, + models_as_dict: bool = True, + **dumps_kwargs: Any, + ) -> str: + """ + Generate a JSON representation of the model, `include` and `exclude` arguments as per `dict()`. + + `encoder` is an optional function to supply as `default` to json.dumps(), other arguments as per `json.dumps()`. 
+ """ + return super().json( + include=include, + exclude=exclude, + by_alias=by_alias, + skip_defaults=skip_defaults, + exclude_unset=exclude_unset, + exclude_defaults=exclude_defaults, + exclude_none=exclude_none, + encoder=encoder, + models_as_dict=models_as_dict, + **dumps_kwargs, + ) + + @no_type_check + def parse_raw( + cls: Type[T], + b: StrBytes, + *, + content_type: str = None, + encoding: str = 'utf8', + proto: Protocol = None, + allow_pickle: bool = False, + ) -> T: + """ + Parse a raw string or bytes into a base doc + :param b: + :param content_type: + :param encoding: the encoding to use when parsing a string, defaults to 'utf8' + :param proto: protocol to use. + :param allow_pickle: allow pickle protocol + :return: a document + """ + return super(BaseDoc, cls).parse_raw( + b, + content_type=content_type, + encoding=encoding, + proto=proto, + allow_pickle=allow_pickle, + ) diff --git a/mkdocs.yml b/mkdocs.yml index e02e8f72056..4817151a82b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -79,6 +79,7 @@ nav: - user_guide/representing/array.md - Sending: - user_guide/sending/first_step.md + - user_guide/sending/send_doc.md - user_guide/sending/send_doclist.md - user_guide/sending/send_docvec.md From e73a6c450364a3f782396066ab05b744123401d2 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 13:59:03 +0200 Subject: [PATCH 15/33] refactor: better tree structure for sending Signed-off-by: samsja --- docs/user_guide/sending/api/fastAPI.md | 0 docs/user_guide/sending/api/jina.md | 1 + docs/user_guide/sending/first_step.md | 2 +- docs/user_guide/sending/{ => ser}/send_doc.md | 0 docs/user_guide/sending/{ => ser}/send_doclist.md | 0 docs/user_guide/sending/{ => ser}/send_docvec.md | 0 mkdocs.yml | 10 +++++++--- 7 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 docs/user_guide/sending/api/fastAPI.md create mode 100644 docs/user_guide/sending/api/jina.md rename docs/user_guide/sending/{ => ser}/send_doc.md (100%) rename 
docs/user_guide/sending/{ => ser}/send_doclist.md (100%) rename docs/user_guide/sending/{ => ser}/send_docvec.md (100%) diff --git a/docs/user_guide/sending/api/fastAPI.md b/docs/user_guide/sending/api/fastAPI.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docs/user_guide/sending/api/jina.md b/docs/user_guide/sending/api/jina.md new file mode 100644 index 00000000000..4e51fd6ee93 --- /dev/null +++ b/docs/user_guide/sending/api/jina.md @@ -0,0 +1 @@ +# Jina \ No newline at end of file diff --git a/docs/user_guide/sending/first_step.md b/docs/user_guide/sending/first_step.md index a18433535b9..f822386b8af 100644 --- a/docs/user_guide/sending/first_step.md +++ b/docs/user_guide/sending/first_step.md @@ -1 +1 @@ -# Sending +# first step diff --git a/docs/user_guide/sending/send_doc.md b/docs/user_guide/sending/ser/send_doc.md similarity index 100% rename from docs/user_guide/sending/send_doc.md rename to docs/user_guide/sending/ser/send_doc.md diff --git a/docs/user_guide/sending/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md similarity index 100% rename from docs/user_guide/sending/send_doclist.md rename to docs/user_guide/sending/ser/send_doclist.md diff --git a/docs/user_guide/sending/send_docvec.md b/docs/user_guide/sending/ser/send_docvec.md similarity index 100% rename from docs/user_guide/sending/send_docvec.md rename to docs/user_guide/sending/ser/send_docvec.md diff --git a/mkdocs.yml b/mkdocs.yml index 4817151a82b..579074bbfc6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -79,9 +79,13 @@ nav: - user_guide/representing/array.md - Sending: - user_guide/sending/first_step.md - - user_guide/sending/send_doc.md - - user_guide/sending/send_doclist.md - - user_guide/sending/send_docvec.md + - Serialization: + - user_guide/sending/ser/send_doc.md + - user_guide/sending/ser/send_doclist.md + - user_guide/sending/ser/send_docvec.md + - Building API: + - user_guide/sending/api/jina.md + - user_guide/sending/api/fastAPI.md - 
user_guide/storing/first_step.md From 66fc6db5c104294d5869b2ccc17134f72874aa3c Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 14:04:17 +0200 Subject: [PATCH 16/33] fix: fix tests Signed-off-by: samsja --- docarray/base_doc/doc.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index dbc1f155f58..bb319f3074d 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -280,7 +280,7 @@ def to_protobuf(self) -> 'DocProto': :return: the protobuf message """ - super().to_protobuf() + return super().to_protobuf() def json( self, @@ -315,13 +315,14 @@ def json( ) @no_type_check + @classmethod def parse_raw( cls: Type[T], - b: StrBytes, + b: 'StrBytes', *, content_type: str = None, encoding: str = 'utf8', - proto: Protocol = None, + proto: 'Protocol' = None, allow_pickle: bool = False, ) -> T: """ From f32bcca05f2f83e8d68b176f8c7258bb80d529bb Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 14:40:57 +0200 Subject: [PATCH 17/33] fix: fix python code snippet ods Signed-off-by: samsja --- docarray/base_doc/mixins/io.py | 1 + docs/user_guide/sending/ser/send_doclist.md | 4 ++-- simple-dl.csv | 3 +++ simple-dl.json | 1 + tests/documentation/test_docs.py | 2 +- 5 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 simple-dl.csv create mode 100644 simple-dl.json diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index b2a64e8082b..ad5913b6498 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -144,6 +144,7 @@ def to_bytes( import pickle if protocol == 'pickle': + breakpoint() bstr = pickle.dumps(self) elif protocol == 'protobuf': bstr = self.to_protobuf().SerializePartialToString() diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md index 25ecce1cbff..532926cba28 100644 --- a/docs/user_guide/sending/ser/send_doclist.md +++ 
b/docs/user_guide/sending/ser/send_doclist.md @@ -154,7 +154,7 @@ class SimpleDoc(BaseDoc): dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) -df = dl.to_pandas() -dl_from_dataframe = DocList[SimpleDoc].from_pandas(df) +df = dl.to_dataframe() +dl_from_dataframe = DocList[SimpleDoc].from_dataframe(df) print(dl_from_dataframe) ``` \ No newline at end of file diff --git a/simple-dl.csv b/simple-dl.csv new file mode 100644 index 00000000000..73fcef9088e --- /dev/null +++ b/simple-dl.csv @@ -0,0 +1,3 @@ +id,text +e5083675a1ff093b5db61485dea954e1,doc 0 +6cf91fb8ce69c2adcca4abeacab1bbb2,doc 1 diff --git a/simple-dl.json b/simple-dl.json new file mode 100644 index 00000000000..e8402651a63 --- /dev/null +++ b/simple-dl.json @@ -0,0 +1 @@ +[{"id":"c972944303fc583b0a66057c323af21a","text":"doc 0"},{"id":"febc35bbd6563d24fa8a832447fba5bb","text":"doc 1"}] \ No newline at end of file diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 6ca32d7700f..646bb4c582f 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -47,7 +47,7 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): 'fpath', pathlib.Path('docs/user_guide').glob('**/*.md'), ids=str ) def test_files_good(fpath): - check_md_file(fpath=fpath, memory=True) + check_md_file(fpath=fpath, memory=True, keyword_ignore=['pickle']) def test_readme(): From 4047c23c4a236c46a205c6bec9ebcb6809646a0d Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 14:50:08 +0200 Subject: [PATCH 18/33] fix: fix remove breakpoint Signed-off-by: samsja --- docarray/base_doc/mixins/io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index ad5913b6498..b2a64e8082b 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -144,7 +144,6 @@ def to_bytes( import pickle if protocol == 'pickle': - breakpoint() bstr = pickle.dumps(self) elif 
protocol == 'protobuf': bstr = self.to_protobuf().SerializePartialToString() From 682130706415ee3637ad20e69642e9c6c7cdc14f Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 15:36:38 +0200 Subject: [PATCH 19/33] feat: add intro Signed-off-by: samsja --- docs/user_guide/sending/first_step.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/sending/first_step.md b/docs/user_guide/sending/first_step.md index f822386b8af..05441f8337d 100644 --- a/docs/user_guide/sending/first_step.md +++ b/docs/user_guide/sending/first_step.md @@ -1 +1,11 @@ -# first step +# Intro + +In the representation section we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] +to represent multi-modal data. In this section we will see **how to send these data over the wire**. + + +This section is dived in two: + +- [Serialization](./ser/send_doc.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] +- [Using DocArray with web framework to build multimodal API](./api/jina.md) + From ce60c65fc1d8839596409beb26d96424251fe282 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 15:48:24 +0200 Subject: [PATCH 20/33] feat: add ref Signed-off-by: samsja --- docs/user_guide/sending/ser/send_doc.md | 11 +++++++++-- docs/user_guide/sending/ser/send_doclist.md | 9 +++++++-- docs/user_guide/sending/ser/send_docvec.md | 8 +++++++- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/docs/user_guide/sending/ser/send_doc.md b/docs/user_guide/sending/ser/send_doc.md index 5b7e7fff91d..e65f5d7d950 100644 --- a/docs/user_guide/sending/ser/send_doc.md +++ b/docs/user_guide/sending/ser/send_doc.md @@ -1,4 +1,4 @@ -# Serialization of BaseDoc +# BaseDoc In order to send or store [BaseDoc][docarray.base_doc.doc.BaseDoc] you need 
to serialize them first. @@ -45,4 +45,11 @@ doc = MyDoc(text='hello world', tags=['hello', 'world']) json_str = doc.json() new_doc = MyDoc.parse_raw(json_str) assert doc == new_doc # True -``` \ No newline at end of file +``` + +See also: + +* The serializing [DocList](./send_doclist.md) section +* The serializing [DocVec](./send_docvec.md) section + + diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md index 532926cba28..e3bb6583e6e 100644 --- a/docs/user_guide/sending/ser/send_doclist.md +++ b/docs/user_guide/sending/ser/send_doclist.md @@ -1,4 +1,4 @@ -# Serialization for DocList +# DocList When sending or storing [`DocList`][docarray.array.doc_list.doc_list.DocList], you need to use serialization. [DocList][docarray.array.doc_list.doc_list.DocList] supports multiple ways to serialize the data. ## JSON @@ -157,4 +157,9 @@ dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) df = dl.to_dataframe() dl_from_dataframe = DocList[SimpleDoc].from_dataframe(df) print(dl_from_dataframe) -``` \ No newline at end of file +``` + +See also: + +* The serializing [BaseDoc](./send_doc.md) section +* The serializing [DocVec](./send_docvec.md) section diff --git a/docs/user_guide/sending/ser/send_docvec.md b/docs/user_guide/sending/ser/send_docvec.md index d400caa844c..3868ff7c60b 100644 --- a/docs/user_guide/sending/ser/send_docvec.md +++ b/docs/user_guide/sending/ser/send_docvec.md @@ -1,4 +1,4 @@ -# Serialization of DocVec +# DocVec When sending or storing [`DocVec`][docarray.array.doc_list.doc_list.DocVec], you need to use serialization. [DocVec][docarray.array.doc_list.doc_list.DocVec] only supports protobuf to serialize the data. 
You can use [`to_protobuf`][docarray.array.doc_list.doc_list.DocVec.to_protobuf] and [`from_protobuf`][docarray.array.doc_list.doc_list.DocVec.from_protobuf] to serialize and deserialize a [DocVec][docarray.array.doc_list.doc_list.DocVec] @@ -21,4 +21,10 @@ proto_message_dv = dv.to_protobuf() dv_from_proto = DocVec[SimpleVecDoc].from_protobuf(proto_message_dv) ``` +!!! note + We are planning to add more serilization format in the future, notably JSON. + [`to_protobuf`][docarray.array.doc_list.doc_list.DocVec.to_protobuf] returns a protobuf object of `docarray_pb2.DocVecProto` class. [`from_protobuf`][docarray.array.doc_list.doc_list.DocVec.from_protobuf] accepts a protobuf message object to construct a [DocVec][docarray.array.doc_list.doc_list.DocVec]. + +* The serializing [BaseDoc](./send_doc.md) section +* The serializing [DocList](./send_doclist.md) section From e66a90096a8a85d9879ee4b986246e6e827de38c Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 16:09:52 +0200 Subject: [PATCH 21/33] feat: move fastapi part Signed-off-by: samsja --- docs/integrations/fastapi.md | 134 ----------------------- docs/user_guide/sending/api/fastAPI.md | 140 +++++++++++++++++++++++++ mkdocs.yml | 2 - 3 files changed, 140 insertions(+), 136 deletions(-) delete mode 100644 docs/integrations/fastapi.md diff --git a/docs/integrations/fastapi.md b/docs/integrations/fastapi.md deleted file mode 100644 index e55b09fba9e..00000000000 --- a/docs/integrations/fastapi.md +++ /dev/null @@ -1,134 +0,0 @@ -# Use DocArray with FastAPI - -FastAPI is a high-performance web framework for building APIs with Python. It's designed to be easy to use and supports asynchronous programming. -Since [`DocArray` documents are Pydantic Models (with a twist)](../user_guide/representing/first_step.md) they can be easily integrated with FastAPI, -and provide a seamless and efficient way to work with multimodal data in FastAPI-powered APIs. 
- - -First, you should define schemas for your input and/or output Documents: -```python -from docarray import BaseDoc -from docarray.documents import ImageDoc -from docarray.typing import NdArray - - -class InputDoc(BaseDoc): - img: ImageDoc - - -class OutputDoc(BaseDoc): - embedding_clip: NdArray - embedding_bert: NdArray -``` - -Afterwards, you can use your Documents with FastAPI: -```python -import numpy as np -from fastapi import FastAPI -from httpx import AsyncClient - -from docarray.documents import ImageDoc -from docarray.base_doc import DocumentResponse - -input_doc = InputDoc(img=ImageDoc(tensor=np.zeros((3, 224, 224)))) - -app = FastAPI() - - -@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) -async def create_item(doc: InputDoc) -> OutputDoc: - ## call my fancy model to generate the embeddings - doc = OutputDoc( - embedding_clip=np.zeros((100, 1)), embedding_bert=np.zeros((100, 1)) - ) - return doc - - -async with AsyncClient(app=app, base_url="http://test") as ac: - response = await ac.post("/doc/", data=input_doc.json()) - -doc = OutputDoc.parse_raw(response.content.decode()) -``` - -The big advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. 
- -This includes handy features such as validating the shape of a tensor: - -```python -from docarray import BaseDoc -from docarray.typing import TorchTensor -import torch - - -class MyDoc(BaseDoc): - tensor: TorchTensor[3, 224, 224] - - -doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works -doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping -doc = MyDoc(tensor=torch.zeros(224)) # fails validation - - -class Image(BaseDoc): - tensor: TorchTensor[3, 'x', 'x'] - - -Image(tensor=torch.zeros(3, 224, 224)) # works -Image( - tensor=torch.zeros(3, 64, 128) -) # fails validation because second dimension does not match third -Image( - tensor=torch.zeros(4, 224, 224) -) # fails validation because of the first dimension -Image( - tensor=torch.zeros(3, 64) -) # fails validation because it does not have enough dimensions -``` - - -Further, you can send and receive lists of Documents represented as a `DocArray` object: - -!!! note - Currently, `FastAPI` receives `DocArray` objects as lists, so you have to construct a DocArray inside the function. - Also, if you want to return a `DocArray` object, first you have to convert it to a list. 
- (Shown in the example below) - -```python -from typing import List - -import numpy as np -from fastapi import FastAPI -from httpx import AsyncClient - -from docarray import DocArray -from docarray.base_doc import DocArrayResponse -from docarray.documents import TextDoc - -# Create a docarray -docs = DocArray[TextDoc]([TextDoc(text='first'), TextDoc(text='second')]) - -app = FastAPI() - - -# Always use our custom response class (needed to dump tensors) -@app.post("/doc/", response_class=DocArrayResponse) -async def create_embeddings(docs: List[TextDoc]) -> List[TextDoc]: - # The docs FastAPI will receive will be treated as List[TextDoc] - # so you need to cast it to DocArray - docs = DocArray[TextDoc].construct(docs) - - # Embed docs - for doc in docs: - doc.embedding = np.zeros((3, 224, 224)) - - # Return your DocArray as a list - return list(docs) - - -async with AsyncClient(app=app, base_url="http://test") as ac: - response = await ac.post("/doc/", data=docs.to_json()) # sending docs as json - -assert response.status_code == 200 -# You can read FastAPI's response in the following way -docs = DocArray[TextDoc].from_json(response.content.decode()) -``` diff --git a/docs/user_guide/sending/api/fastAPI.md b/docs/user_guide/sending/api/fastAPI.md index e69de29bb2d..5409b989787 100644 --- a/docs/user_guide/sending/api/fastAPI.md +++ b/docs/user_guide/sending/api/fastAPI.md @@ -0,0 +1,140 @@ +# FastAPI + +[FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework for building APIs with Python based on python type hint. It's designed to be easy to use and supports asynchronous programming. +Since [`DocArray` documents are Pydantic Models (with a twist)](../../representing/first_step.md) they can be easily integrated with FastAPI, +and provide a seamless and efficient way to work with multimodal data in FastAPI-powered APIs. + +!!! 
note + you need to install FastAPI to follow this section + ``` + pip install fastapi + ``` + + +First, you should define schemas for your input and/or output Documents: +```python +from docarray import BaseDoc +from docarray.documents import ImageDoc +from docarray.typing import NdArray + + +class InputDoc(BaseDoc): + img: ImageDoc + + +class OutputDoc(BaseDoc): + embedding_clip: NdArray + embedding_bert: NdArray +``` + +Afterwards, you can use your Documents with FastAPI: +```python +import numpy as np +from fastapi import FastAPI +from httpx import AsyncClient + +from docarray.documents import ImageDoc +from docarray.base_doc import DocumentResponse + +input_doc = InputDoc(img=ImageDoc(tensor=np.zeros((3, 224, 224)))) + +app = FastAPI() + + +@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) +async def create_item(doc: InputDoc) -> OutputDoc: + ## call my fancy model to generate the embeddings + doc = OutputDoc( + embedding_clip=np.zeros((100, 1)), embedding_bert=np.zeros((100, 1)) + ) + return doc + + +async with AsyncClient(app=app, base_url="http://test") as ac: + response = await ac.post("/doc/", data=input_doc.json()) + +doc = OutputDoc.parse_raw(response.content.decode()) +``` + +The big advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. 
+ +This includes handy features such as validating the shape of a tensor: + +```python +from docarray import BaseDoc +from docarray.typing import TorchTensor +import torch + + +class MyDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + +doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works +doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping +doc = MyDoc(tensor=torch.zeros(224)) # fails validation + + +class Image(BaseDoc): + tensor: TorchTensor[3, 'x', 'x'] + + +Image(tensor=torch.zeros(3, 224, 224)) # works +Image( + tensor=torch.zeros(3, 64, 128) +) # fails validation because second dimension does not match third +Image( + tensor=torch.zeros(4, 224, 224) +) # fails validation because of the first dimension +Image( + tensor=torch.zeros(3, 64) +) # fails validation because it does not have enough dimensions +``` + + +Further, you can send and receive lists of Documents represented as a `DocArray` object: + +!!! note + Currently, `FastAPI` receives `DocArray` objects as lists, so you have to construct a DocArray inside the function. + Also, if you want to return a `DocArray` object, first you have to convert it to a list. 
+ (Shown in the example below) + +```python +from typing import List + +import numpy as np +from fastapi import FastAPI +from httpx import AsyncClient + +from docarray import DocArray +from docarray.base_doc import DocArrayResponse +from docarray.documents import TextDoc + +# Create a docarray +docs = DocArray[TextDoc]([TextDoc(text='first'), TextDoc(text='second')]) + +app = FastAPI() + + +# Always use our custom response class (needed to dump tensors) +@app.post("/doc/", response_class=DocArrayResponse) +async def create_embeddings(docs: List[TextDoc]) -> List[TextDoc]: + # The docs FastAPI will receive will be treated as List[TextDoc] + # so you need to cast it to DocArray + docs = DocArray[TextDoc].construct(docs) + + # Embed docs + for doc in docs: + doc.embedding = np.zeros((3, 224, 224)) + + # Return your DocArray as a list + return list(docs) + + +async with AsyncClient(app=app, base_url="http://test") as ac: + response = await ac.post("/doc/", data=docs.to_json()) # sending docs as json + +assert response.status_code == 200 +# You can read FastAPI's response in the following way +docs = DocArray[TextDoc].from_json(response.content.decode()) +``` diff --git a/mkdocs.yml b/mkdocs.yml index 09b8d12e79a..991966990f2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -98,8 +98,6 @@ nav: - how_to/multimodal_training_and_serving.md - how_to/optimize_performance_with_id_generation.md - how_to/audio2text.md - - Integrations: - - integrations/fastapi.md - Data Types: - data_types/text/text.md - data_types/image/image.md From a642abe967514e0ecf939049b7222b8a2721228c Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 17:25:40 +0200 Subject: [PATCH 22/33] fix: fix fastAPI Signed-off-by: samsja --- simple-dl.csv | 4 ++-- simple-dl.json | 2 +- tests/documentation/test_docs.py | 19 +++++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/simple-dl.csv b/simple-dl.csv index 73fcef9088e..b30400587f3 100644 --- a/simple-dl.csv +++ 
b/simple-dl.csv @@ -1,3 +1,3 @@ id,text -e5083675a1ff093b5db61485dea954e1,doc 0 -6cf91fb8ce69c2adcca4abeacab1bbb2,doc 1 +31b05a66db6fffb90f7b3e5edb71fc52,doc 0 +2bfa118dceb366281d0714b02a78b9c7,doc 1 diff --git a/simple-dl.json b/simple-dl.json index e8402651a63..07bc6ea9e99 100644 --- a/simple-dl.json +++ b/simple-dl.json @@ -1 +1 @@ -[{"id":"c972944303fc583b0a66057c323af21a","text":"doc 0"},{"id":"febc35bbd6563d24fa8a832447fba5bb","text":"doc 1"}] \ No newline at end of file +[{"id":"7d913dc1ed6d875c0b576abf092100d3","text":"doc 0"},{"id":"af7978f9eb8d44de95371e3781a3f37e","text":"doc 1"}] \ No newline at end of file diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 6b5390215e6..447d549788f 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -43,14 +43,17 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore) -@pytest.mark.parametrize( - 'fpath', - [ - *list(pathlib.Path('docs/user_guide').glob('**/*.md')), - *list(pathlib.Path('docs/data_types').glob('**/*.md')), - ], - ids=str, -) +files_to_check = [ + *list(pathlib.Path('docs/user_guide').glob('**/*.md')), + *list(pathlib.Path('docs/data_types').glob('**/*.md')), +] + +for file in files_to_check: + if 'fastAPI' in str(file): # for now we don't test fastAPI stuff because of async + files_to_check.remove(file) + + +@pytest.mark.parametrize('fpath', files_to_check, ids=str) def test_files_good(fpath): check_md_file(fpath=fpath, memory=True, keyword_ignore=['pickle']) From 840a650929056cdaf324d7e971f484da6b524d92 Mon Sep 17 00:00:00 2001 From: samsja Date: Wed, 12 Apr 2023 17:30:03 +0200 Subject: [PATCH 23/33] fix: remove uselss mixin Signed-off-by: samsja --- docs/api_references/array/da.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index eedcec827cd..21a206a9537 100644 --- 
a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,4 +1,3 @@ # DocList ::: docarray.array.doc_list.doc_list.DocList -::: docarray.array.doc_list.io.IOMixinArray From 8c2cf02d3577c05287acc4d85ae615e341f59a3a Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 13:25:05 +0200 Subject: [PATCH 24/33] faet: add jina section Signed-off-by: samsja --- docs/how_to/audio2text.md | 79 --------------------------- docs/user_guide/sending/api/jina.md | 82 ++++++++++++++++++++++++++++- mkdocs.yml | 1 - simple-dl.csv | 3 -- simple-dl.json | 1 - tests/documentation/test_docs.py | 12 ++++- 6 files changed, 91 insertions(+), 87 deletions(-) delete mode 100644 docs/how_to/audio2text.md delete mode 100644 simple-dl.csv delete mode 100644 simple-dl.json diff --git a/docs/how_to/audio2text.md b/docs/how_to/audio2text.md deleted file mode 100644 index fcec869ce0f..00000000000 --- a/docs/how_to/audio2text.md +++ /dev/null @@ -1,79 +0,0 @@ -# Creating an Audio to Text App with Jina and DocArray V2 - -This is how you can build an Audio to Text app using Jina, Docarray and Whisper - -We will use: - -* DocarrayV2: Helps us to load and preprocess multimodal data such as image, text and audio in our case -* Jina: Helps us serve the model quickly and create a client - -First let's install requirements - -## 💾 Installation - -```bash -pip install transformers -pip install openai-whisper -pip install jina -``` - -Now let's import necessary libraries - - -```python -import whisper -from jina import Executor, requests, Deployment -from docarray import BaseDoc, DocList -from docarray.typing import AudioUrl -``` - -Now we need to create the schema of our input and output documents. 
Since our input is an audio -our input schema should contain an AudioUrl like the following - -```python -class AudioURL(BaseDoc): - audio: AudioUrl -``` - -As for the output schema we would like to receive the transcribed text so we use the following: - -```python -class Response(BaseDoc): - text: str -``` - -Now it's time we create our model, we wrap our model into Jina Executor, this allows us to serve to model -later on and expose its endpoint /transcribe - -```python -class WhisperExecutor(Executor): - def __init__(self, device: str, *args, **kwargs): - super().__init__(*args, **kwargs) - self.model = whisper.load_model("medium.en", device=device) - - @requests - def transcribe(self, docs: DocList[AudioURL], **kwargs) -> DocList[Response]: - response_docs = DocList[Response]() - for doc in docs: - transcribed_text = self.model.transcribe(str(doc.audio))['text'] - response_docs.append(Response(text=transcribed_text)) - return response_docs -``` - -Now we can leverage Deployment object provided by Jina to use this executor -then we send a request to transcribe endpoint. Here we are using an audio file previously recorded -that says, "A Man reading a book" saved under resources/audio.mp3 but feel free to use your own audio. - -```python -with Deployment( - uses=WhisperExecutor, uses_with={'device': "cpu"}, port=12349, timeout_ready=-1 -) as d: - docs = d.post( - on='/transcribe', - inputs=[AudioURL(audio='resources/audio.mp3')], - return_type=DocList[Response], - ) - print(docs[0].text) -``` - -And we get the transcribed result! 
\ No newline at end of file diff --git a/docs/user_guide/sending/api/jina.md b/docs/user_guide/sending/api/jina.md index 4e51fd6ee93..1afd247f1d4 100644 --- a/docs/user_guide/sending/api/jina.md +++ b/docs/user_guide/sending/api/jina.md @@ -1 +1,81 @@ -# Jina \ No newline at end of file +# Jina + +# Creating an Audio to Text App with Jina and DocArray V2 + +This is how you can build an Audio to Text app using Jina, Docarray and Whisper + +We will use: + +* DocarrayV2: Helps us to load and preprocess multimodal data such as image, text and audio in our case +* Jina: Helps us serve the model quickly and create a client + +First let's install requirements + +## 💾 Installation + +```bash +pip install transformers +pip install openai-whisper +pip install jina +``` + +Now let's import necessary libraries + + +```python +import whisper +from jina import Executor, requests, Deployment +from docarray import BaseDoc, DocList +from docarray.typing import AudioUrl +``` + +Now we need to create the schema of our input and output documents. 
Since our input is an audio +our input schema should contain an AudioUrl like the following + +```python +class AudioURL(BaseDoc): + audio: AudioUrl +``` + +As for the output schema we would like to receive the transcribed text so we use the following: + +```python +class Response(BaseDoc): + text: str +``` + +Now it's time we create our model, we wrap our model into Jina Executor, this allows us to serve to model +later on and expose its endpoint /transcribe + +```python +class WhisperExecutor(Executor): + def __init__(self, device: str, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model = whisper.load_model("medium.en", device=device) + + @requests + def transcribe(self, docs: DocList[AudioURL], **kwargs) -> DocList[Response]: + response_docs = DocList[Response]() + for doc in docs: + transcribed_text = self.model.transcribe(str(doc.audio))['text'] + response_docs.append(Response(text=transcribed_text)) + return response_docs +``` + +Now we can leverage Deployment object provided by Jina to use this executor +then we send a request to transcribe endpoint. Here we are using an audio file previously recorded +that says, "A Man reading a book" saved under resources/audio.mp3 but feel free to use your own audio. + +```python +with Deployment( + uses=WhisperExecutor, uses_with={'device': "cpu"}, port=12349, timeout_ready=-1 +) as d: + docs = d.post( + on='/transcribe', + inputs=[AudioURL(audio='resources/audio.mp3')], + return_type=DocList[Response], + ) + print(docs[0].text) +``` + +And we get the transcribed result! 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 991966990f2..f7f8c00ae9f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -97,7 +97,6 @@ nav: - how_to/add_doc_index.md - how_to/multimodal_training_and_serving.md - how_to/optimize_performance_with_id_generation.md - - how_to/audio2text.md - Data Types: - data_types/text/text.md - data_types/image/image.md diff --git a/simple-dl.csv b/simple-dl.csv deleted file mode 100644 index b30400587f3..00000000000 --- a/simple-dl.csv +++ /dev/null @@ -1,3 +0,0 @@ -id,text -31b05a66db6fffb90f7b3e5edb71fc52,doc 0 -2bfa118dceb366281d0714b02a78b9c7,doc 1 diff --git a/simple-dl.json b/simple-dl.json deleted file mode 100644 index 07bc6ea9e99..00000000000 --- a/simple-dl.json +++ /dev/null @@ -1 +0,0 @@ -[{"id":"7d913dc1ed6d875c0b576abf092100d3","text":"doc 0"},{"id":"af7978f9eb8d44de95371e3781a3f37e","text":"doc 1"}] \ No newline at end of file diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 447d549788f..ccda4714700 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,6 +4,8 @@ from mktestdocs import grab_code_blocks from mktestdocs.__main__ import _executors, check_raw_string +file_to_skip = ['fastAPI', 'jina'] + def check_raw_file_full(raw, lang="python", keyword_ignore=[]): if lang not in _executors: @@ -48,9 +50,15 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): *list(pathlib.Path('docs/data_types').glob('**/*.md')), ] +file_to_remove = [] + for file in files_to_check: - if 'fastAPI' in str(file): # for now we don't test fastAPI stuff because of async - files_to_check.remove(file) + for fn in file_to_skip: + if fn in str(file): + file_to_remove.append(file) + +for file in file_to_remove: + files_to_check.remove(file) @pytest.mark.parametrize('fpath', files_to_check, ids=str) From c7507bf3199087c2f2922962beeb163bbe26e72b Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 13:41:21 +0200 Subject: 
[PATCH 25/33] fix: compress -> compression Signed-off-by: samsja --- docarray/array/doc_list/doc_list.py | 2 +- docarray/array/doc_list/io.py | 2 +- docarray/base_doc/doc.py | 8 ++++---- docarray/base_doc/mixins/io.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 44e3ba2abce..3725fcc0737 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -335,7 +335,7 @@ def from_bytes( :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` + :param compress: compression algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized DocList """ diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 3e3b36adde4..688d0310bee 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -141,7 +141,7 @@ def from_bytes( :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` + :param compress: compression algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized `DocList` """ diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index bb319f3074d..cfb73f1f422 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -170,7 +170,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compression algorithm to use :return: the binary serialization in bytes """ return super().to_bytes(protocol, compress) @@ -186,7 +186,7 @@ def from_bytes( :param data: binary bytes :param protocol: protocol to use. It can be 'pickle' or 'protobuf' - :param compress: compress method to use + :param compress: compression method to use :return: a Document object """ return super(BaseDoc, cls).from_bytes(data, protocol, compress) @@ -197,7 +197,7 @@ def to_base64( """Serialize a Document object into as base64 string :param protocol: protocol to use. It can be 'pickle' or 'protobuf' - :param compress: compress method to use + :param compress: compression method to use :return: a base64 encoded string """ return super().to_base64(protocol, compress) @@ -213,7 +213,7 @@ def from_base64( :param data: a base64 encoded string :param protocol: protocol to use. It can be 'pickle' or 'protobuf' - :param compress: compress method to use + :param compress: compression method to use :return: a Document object """ return super(BaseDoc, cls).from_base64(data, protocol, compress) diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index b2a64e8082b..e50d9ac791d 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -138,7 +138,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compression algorithm to use :return: the binary serialization in bytes """ import pickle From 27b48bff322fc829156c18136960571ecfaaa2ab Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 13 Apr 2023 13:43:44 +0200 Subject: [PATCH 26/33] feat: apply suggestion Co-authored-by: Alex Cureton-Griffiths Co-authored-by: Charlotte Gerhaher Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docarray/base_doc/doc.py | 9 +++++---- docs/user_guide/sending/api/fastAPI.md | 2 +- docs/user_guide/sending/first_step.md | 4 ++-- docs/user_guide/sending/ser/send_doc.md | 4 ++-- docs/user_guide/sending/ser/send_doclist.md | 2 +- docs/user_guide/sending/ser/send_docvec.md | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index cfb73f1f422..3319e6b6cc8 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -230,8 +230,8 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T: def update(self, other: T_update): """ Updates self with the content of other. Changes are applied to self. 
- Updating one Document with another consists in the following: - - setting data properties of the second Document to the first Document + Updating one Document with another consists of the following: + - Setting data properties of the second Document to the first Document if they are not None - Concatenating lists and updating sets - Updating recursively Documents and DocArrays @@ -249,8 +249,9 @@ def update(self, other: T_update): --- ```python + from typing import Optional, List + from docarray import BaseDoc - from docarray.documents import Text class MyDocument(BaseDoc): @@ -271,7 +272,7 @@ class MyDocument(BaseDoc): ``` --- - :param other: The Document with which to update the contents of this + :param other: The Document used to update the contents of this Document """ super().update(other) diff --git a/docs/user_guide/sending/api/fastAPI.md b/docs/user_guide/sending/api/fastAPI.md index 5409b989787..d35308fefce 100644 --- a/docs/user_guide/sending/api/fastAPI.md +++ b/docs/user_guide/sending/api/fastAPI.md @@ -1,6 +1,6 @@ # FastAPI -[FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework for building APIs with Python based on python type hint. It's designed to be easy to use and supports asynchronous programming. +[FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework for building APIs with Python based on Python type hints. It's designed to be easy to use and supports asynchronous programming. Since [`DocArray` documents are Pydantic Models (with a twist)](../../representing/first_step.md) they can be easily integrated with FastAPI, and provide a seamless and efficient way to work with multimodal data in FastAPI-powered APIs. 
diff --git a/docs/user_guide/sending/first_step.md b/docs/user_guide/sending/first_step.md index 05441f8337d..5d7fbfa5816 100644 --- a/docs/user_guide/sending/first_step.md +++ b/docs/user_guide/sending/first_step.md @@ -4,8 +4,8 @@ In the representation section we saw how to use [`BaseDoc`][docarray.base_doc.do to represent multi-modal data. In this section we will see **how to send these data over the wire**. -This section is dived in two: +This section is divided into two: - [Serialization](./ser/send_doc.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] -- [Using DocArray with web framework to build multimodal API](./api/jina.md) +- [Using DocArray with a web framework to build a multimodal API](./api/jina.md) diff --git a/docs/user_guide/sending/ser/send_doc.md b/docs/user_guide/sending/ser/send_doc.md index e65f5d7d950..0f3826e2288 100644 --- a/docs/user_guide/sending/ser/send_doc.md +++ b/docs/user_guide/sending/ser/send_doc.md @@ -1,8 +1,8 @@ # BaseDoc -In order to send or store [BaseDoc][docarray.base_doc.doc.BaseDoc] you need to serialize them first. +You need to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] before you can store or send it. -!! note +!!! note [BaseDoc][docarray.base_doc.doc.BaseDoc] supports serialization to `protobuf` and `json` formats. ## Serialization to protobuf diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md index e3bb6583e6e..dd4362a3cbe 100644 --- a/docs/user_guide/sending/ser/send_doclist.md +++ b/docs/user_guide/sending/ser/send_doclist.md @@ -78,7 +78,7 @@ dl_from_base64 = DocList[SimpleDoc].from_base64( ``` ## Binary -Similar as in `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. +Similar to `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. 
To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`load_binary()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]. diff --git a/docs/user_guide/sending/ser/send_docvec.md b/docs/user_guide/sending/ser/send_docvec.md index 3868ff7c60b..3fbaf759075 100644 --- a/docs/user_guide/sending/ser/send_docvec.md +++ b/docs/user_guide/sending/ser/send_docvec.md @@ -22,7 +22,7 @@ dv_from_proto = DocVec[SimpleVecDoc].from_protobuf(proto_message_dv) ``` !!! note - We are planning to add more serilization format in the future, notably JSON. + We are planning to add more serialization formats in the future, notably JSON. [`to_protobuf`][docarray.array.doc_list.doc_list.DocVec.to_protobuf] returns a protobuf object of `docarray_pb2.DocVecProto` class. [`from_protobuf`][docarray.array.doc_list.doc_list.DocVec.from_protobuf] accepts a protobuf message object to construct a [DocVec][docarray.array.doc_list.doc_list.DocVec]. 
From 1b1c5037138a58784f0f8555e8f450916c17e827 Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 13:51:37 +0200 Subject: [PATCH 27/33] fix: apply alex suggestion Signed-off-by: samsja --- docs/user_guide/sending/ser/send_doc.md | 2 +- docs/user_guide/sending/ser/send_doclist.md | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/user_guide/sending/ser/send_doc.md b/docs/user_guide/sending/ser/send_doc.md index 0f3826e2288..caa93c4fdb1 100644 --- a/docs/user_guide/sending/ser/send_doc.md +++ b/docs/user_guide/sending/ser/send_doc.md @@ -26,7 +26,7 @@ new_doc = MyDoc.from_protobuf(proto_message) assert doc == new_doc # True ``` -## Serialization to json +## Serialization to JSON You can use [`json`][docarray.base_doc.doc.BaseDoc.json] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a json string and use [`parse_raw`][docarray.base_doc.doc.BaseDoc.parse_raw] to deserialize it. diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md index dd4362a3cbe..33c0a7b1718 100644 --- a/docs/user_guide/sending/ser/send_doclist.md +++ b/docs/user_guide/sending/ser/send_doclist.md @@ -2,7 +2,7 @@ When sending or storing [`DocList`][docarray.array.doc_list.doc_list.DocList], you need to use serialization. [DocList][docarray.array.doc_list.doc_list.DocList] supports multiple ways to serialize the data. ## JSON -You can use [`to_json()`][docarray.array.doc_list.doc_list.DocList.to_json] and [`from_json()`][docarray.array.doc_list.doc_list.DocList.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]. 
+You can use [`to_json()`][docarray.array.doc_list.doc_list.DocList.to_json] and [`from_json()`][docarray.array.doc_list.doc_list.DocList.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: ```python from docarray import BaseDoc, DocList @@ -31,7 +31,7 @@ b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03 ``` ## Protobuf -To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.doc_list.DocList.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]. +To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.doc_list.DocList.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: ```python from docarray import BaseDoc, DocList @@ -55,7 +55,7 @@ print(dl_from_proto) When transferring over the network, you can choose `Base64` format to serialize the [`DocList`][docarray.array.doc_list.doc_list.DocList]. Serializing a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. -To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`from_base64()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]. 
+To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`from_base64()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]: We support multiple compression methods. (namely : `lz4`, `bz2`, `lzma`, `zlib`, `gzip`) @@ -80,7 +80,7 @@ dl_from_base64 = DocList[SimpleDoc].from_base64( ## Binary Similar to `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. -To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`load_binary()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]. +To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`load_binary()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]: ```python from docarray import BaseDoc, DocList @@ -123,7 +123,7 @@ dl_from_bytes = DocList[SimpleDoc].from_bytes( ## CSV -You can use [`from_csv()`][docarray.array.doc_list.doc_list.DocList.from_csv] and [`to_csv()`][docarray.array.doc_list.doc_list.DocList.to_csv] to de-/serializae and deserialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format. +You can use [`from_csv()`][docarray.array.doc_list.doc_list.DocList.from_csv] and [`to_csv()`][docarray.array.doc_list.doc_list.DocList.to_csv] to de-/serializae and deserialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. 
Use the `dialect` parameter to choose the dialect of the CSV format: ```python from docarray import BaseDoc, DocList @@ -142,7 +142,7 @@ print(dl_from_csv) ## Pandas.Dataframe -You can use [`from_dataframe()`][docarray.array.doc_list.doc_list.DocList.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.doc_list.DocList.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame. +You can use [`from_dataframe()`][docarray.array.doc_list.doc_list.DocList.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.doc_list.DocList.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame: ```python from docarray import BaseDoc, DocList From dde1612d6a25854b8f40453b5a6615bf557adccc Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 14:04:36 +0200 Subject: [PATCH 28/33] wip Signed-off-by: samsja --- docarray/base_doc/doc.py | 121 ----------------------- docs/api_references/base_doc/base_doc.md | 3 + 2 files changed, 3 insertions(+), 121 deletions(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 3319e6b6cc8..4ba3ae117c1 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -162,127 +162,6 @@ def _docarray_to_json_compatible(self) -> Dict: ### this section is just for documentation purposes will be removed later once https://github.com/mkdocstrings/griffe/issues/138 is fixed ############## ######################################################################################################################################################## - def to_bytes( - self, protocol: str = 'protobuf', compress: Optional[str] = None - ) -> bytes: - """Serialize itself into bytes. - - For more Pythonic code, please use ``bytes(...)``. - - :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compression algorithm to use - :return: the binary serialization in bytes - """ - return super().to_bytes(protocol, compress) - - @classmethod - def from_bytes( - cls: Type[T], - data: bytes, - protocol: str = 'protobuf', - compress: Optional[str] = None, - ) -> T: - """Build Document object from binary bytes - - :param data: binary bytes - :param protocol: protocol to use. It can be 'pickle' or 'protobuf' - :param compress: compression method to use - :return: a Document object - """ - return super(BaseDoc, cls).from_bytes(data, protocol, compress) - - def to_base64( - self, protocol: str = 'protobuf', compress: Optional[str] = None - ) -> str: - """Serialize a Document object into as base64 string - - :param protocol: protocol to use. It can be 'pickle' or 'protobuf' - :param compress: compression method to use - :return: a base64 encoded string - """ - return super().to_base64(protocol, compress) - - @classmethod - def from_base64( - cls: Type[T], - data: str, - protocol: str = 'pickle', - compress: Optional[str] = None, - ) -> T: - """Build Document object from binary bytes - - :param data: a base64 encoded string - :param protocol: protocol to use. It can be 'pickle' or 'protobuf' - :param compress: compression method to use - :return: a Document object - """ - return super(BaseDoc, cls).from_base64(data, protocol, compress) - - @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T: - """create a Document from a protobuf message - - :param pb_msg: the proto message of the Document - :return: a Document initialize with the proto data - """ - return super(BaseDoc, cls).from_protobuf(pb_msg) - - def update(self, other: T_update): - """ - Updates self with the content of other. Changes are applied to self. 
- Updating one Document with another consists of the following: - - Setting data properties of the second Document to the first Document - if they are not None - - Concatenating lists and updating sets - - Updating recursively Documents and DocArrays - - Updating Dictionaries of the left with the right - - It behaves as an update operation for Dictionaries, except that since - it is applied to a static schema type, the presence of the field is - given by the field not having a None value and that DocArrays, - lists and sets are concatenated. It is worth mentioning that Tuples - are not merged together since they are meant to be inmutable, - so they behave as regular types and the value of `self` is updated - with the value of `other` - - - --- - - ```python - from typing import Optional, List - - from docarray import BaseDoc - - - class MyDocument(BaseDoc): - content: str - title: Optional[str] = None - tags_: List - - - doc1 = MyDocument( - content='Core content of the document', title='Title', tags_=['python', 'AI'] - ) - doc2 = MyDocument(content='Core content updated', tags_=['docarray']) - - doc1.update(doc2) - assert doc1.content == 'Core content updated' - assert doc1.title == 'Title' - assert doc1.tags_ == ['python', 'AI', 'docarray'] - ``` - - --- - :param other: The Document used to update the contents of this Document - """ - super().update(other) - - def to_protobuf(self) -> 'DocProto': - """Convert Document into a Protobuf message. 
- - :return: the protobuf message - """ - return super().to_protobuf() - def json( self, *, diff --git a/docs/api_references/base_doc/base_doc.md b/docs/api_references/base_doc/base_doc.md index 0fe2dc80891..abce654ee96 100644 --- a/docs/api_references/base_doc/base_doc.md +++ b/docs/api_references/base_doc/base_doc.md @@ -1,3 +1,6 @@ # BaseDoc ::: docarray.base_doc.doc.BaseDoc +::: docarray.base_doc.mixins.io.IOMixin +::: docarray.base_doc.mixins.update.UpdateMixin + From 3d0d7457e358940cd6dca372f35a7a7dadf117b4 Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 15:58:42 +0200 Subject: [PATCH 29/33] fix: fix all docstring Signed-off-by: samsja --- docarray/array/any_array.py | 14 +- docarray/array/doc_list/doc_list.py | 294 +------------------- docs/api_references/array/da.md | 2 + docs/user_guide/sending/ser/send_doc.md | 4 +- docs/user_guide/sending/ser/send_doclist.md | 18 +- 5 files changed, 21 insertions(+), 311 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index 901f87f82a9..3d966d34904 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -121,7 +121,7 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this [`DocList`][docarray.typing.DocList] using the passed values + """Set all Documents in this [`DocList`][docarray.array.doc_list.doc_list.DocList] using the passed values :param field: name of the fields to extract :values: the values to set at the DocList level @@ -140,7 +140,7 @@ def to_protobuf(self) -> 'DocListProto': ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a [`DocList`][docarray.typing.DocList] into a NodeProto protobuf message. + """Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto protobuf message. 
This function should be called when a DocList is nested into another Document that need to be converted into a protobuf @@ -157,7 +157,7 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of [`DocList`s][docarray.typing.DocList], the list will be flattened + results in a nested list or list of [`DocList`s][docarray.array.doc_list.doc_list.DocList], the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and `"__"`-separated. It describes the path from the first level to an arbitrary one, e.g. `'content__image__url'`. @@ -209,7 +209,7 @@ class Book(BaseDoc): ``` - If your [`DocList`][docarray.typing.DocList] is in doc_vec mode, and you want to access a field of + If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: ```python @@ -265,7 +265,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]: def summary(self): """ - Print a summary of this [`DocList`][docarray.typing.DocList] object and a summary of the schema of its + Print a summary of this [`DocList`][docarray.array.doc_list.doc_list.DocList] object and a summary of the schema of its Document type. """ DocArraySummary(self).summary() @@ -277,13 +277,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields [`DocList`][docarray.typing.DocList] of size `batch_size`. + Creates a `Generator` that yields [`DocList`][docarray.array.doc_list.doc_list.DocList] of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. 
:param show_progress: if set, show a progress bar when batching documents. - :yield: a Generator of [`DocList`][docarray.typing.DocList], each in the length of `batch_size` + :yield: a Generator of [`DocList`][docarray.array.doc_list.doc_list.DocList], each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 3725fcc0737..8eb1a822d59 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -1,15 +1,10 @@ -import csv import io -import pathlib from functools import wraps from typing import ( TYPE_CHECKING, Any, - BinaryIO, Callable, - Generator, Iterable, - Iterator, List, MutableSequence, Optional, @@ -23,7 +18,7 @@ from typing_inspect import is_union_type from docarray.array.any_array import AnyDocArray -from docarray.array.doc_list.io import IOMixinArray, _LazyRequestReader +from docarray.array.doc_list.io import IOMixinArray from docarray.array.doc_list.pushpull import PushPullMixin from docarray.array.doc_list.sequence_indexing_mixin import ( IndexingSequenceMixin, @@ -33,7 +28,6 @@ from docarray.typing import NdArray if TYPE_CHECKING: - import pandas as pd from pydantic import BaseConfig from pydantic.fields import ModelField @@ -314,289 +308,3 @@ def __getitem__(self: T, item: IndexIterType) -> T: def __getitem__(self, item): return super().__getitem__(item) - - ######################################################################################################################################################## - ### this section is just for documentation purposes will be removed later once https://github.com/mkdocstrings/griffe/issues/138 is fixed ############## - ######################################################################################################################################################## - - def to_protobuf(self) -> 'DocListProto': - """Convert DocList into a Protobuf message""" - return 
super(DocList, self).to_protobuf() - - @classmethod - def from_bytes( - cls: Type[T], - data: bytes, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - show_progress: bool = False, - ) -> T: - """Deserialize bytes into a DocList. - - :param data: Bytes from which to deserialize - :param protocol: protocol that was used to serialize - :param compress: compression algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocList - """ - return super(DocList, cls).from_bytes( - data, protocol=protocol, compress=compress, show_progress=show_progress - ) - - def to_binary_stream( - self, - protocol: str = 'protobuf', - compress: Optional[str] = None, - show_progress: bool = False, - ) -> Iterator[bytes]: - return super().to_binary_stream( - protocol=protocol, compress=compress, show_progress=show_progress - ) - - def to_bytes( - self, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - file_ctx: Optional[BinaryIO] = None, - show_progress: bool = False, - ) -> Optional[bytes]: - """Serialize itself into bytes. - - For more Pythonic code, please use ``bytes(...)``. - - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` - :param file_ctx: File or filename or serialized bytes where the data is stored. 
- :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the binary serialization in bytes or None if file_ctx is passed where to store - """ - return super().to_bytes( - protocol=protocol, - compress=compress, - file_ctx=file_ctx, - show_progress=show_progress, - ) - - @classmethod - def from_base64( - cls: Type[T], - data: str, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - show_progress: bool = False, - ) -> T: - """Deserialize base64 strings into a DocList. - - :param data: Base64 string to deserialize - :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocList - """ - return super(DocList, cls).from_base64( - data, protocol=protocol, compress=compress, show_progress=show_progress - ) - - def to_base64( - self, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - show_progress: bool = False, - ) -> str: - """Serialize itself into base64 encoded string. - - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the binary serialization in bytes or None if file_ctx is passed where to store - """ - return super().to_base64( - protocol=protocol, compress=compress, show_progress=show_progress - ) - - @classmethod - def from_json( - cls: Type[T], - file: Union[str, bytes, bytearray], - ) -> T: - """Deserialize JSON strings or bytes into a DocList. 
- - :param file: JSON object from where to deserialize a DocList - :return: the deserialized DocList - """ - return super(DocList, cls).from_json(file) - - def to_json(self) -> bytes: - """Convert the object into JSON bytes. Can be loaded via :meth:`.from_json`. - :return: JSON serialization of DocList - """ - return super().to_json() - - @classmethod - def from_csv( - cls, - file_path: str, - encoding: str = 'utf-8', - dialect: Union[str, csv.Dialect] = 'excel', - ) -> 'DocList': - """ - Load a DocList from a csv file following the schema defined in the - :attr:`~docarray.DocList.doc_type` attribute. - Every row of the csv file will be mapped to one document in the doc_list. - The column names (defined in the first row) have to match the field names - of the Document type. - For nested fields use "__"-separated access paths, such as 'image__url'. - - List-like fields (including field of type DocList) are not supported. - - :param file_path: path to csv file to load DocList from. - :param encoding: encoding used to read the csv file. Defaults to 'utf-8'. - :param dialect: defines separator and how to handle whitespaces etc. - Can be a csv.Dialect instance or one string of: - 'excel' (for comma seperated values), - 'excel-tab' (for tab separated values), - 'unix' (for csv file generated on UNIX systems). - :return: DocList - """ - return super(DocList, cls).from_csv( - file_path, encoding=encoding, dialect=dialect - ) - - def to_csv( - self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' - ) -> None: - """ - Save a DocList to a csv file. - The field names will be stored in the first row. Each row corresponds to the - information of one Document. - Columns for nested fields will be named after the "__"-seperated access paths, - such as `"image__url"` for `image.url`. - - :param file_path: path to a csv file. - :param dialect: defines separator and how to handle whitespaces etc. 
- Can be a csv.Dialect instance or one string of: - 'excel' (for comma seperated values), - 'excel-tab' (for tab separated values), - 'unix' (for csv file generated on UNIX systems). - """ - return super().to_csv(file_path, dialect=dialect) - - @classmethod - def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList': - """ - Load a DocList from a `pandas.DataFrame` following the schema - defined in the :attr:`~docarray.DocList.doc_type` attribute. - Every row of the dataframe will be mapped to one Document in the doc_list. - The column names of the dataframe have to match the field names of the - Document type. - For nested fields use "__"-separated access paths as column names, - such as 'image__url'. - - List-like fields (including field of type DocList) are not supported. - - - --- - - ```python - import pandas as pd - - from docarray import BaseDoc, DocList - - - class Person(BaseDoc): - name: str - follower: int - - - df = pd.DataFrame( - data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] - ) - - docs = DocList[Person].from_dataframe(df) - - assert docs.name == ['Maria', 'Jake'] - assert docs.follower == [12345, 54321] - ``` - - --- - - :param df: pandas.DataFrame to extract Document's information from - :return: DocList where each Document contains the information of one - corresponding row of the `pandas.DataFrame`. - """ - return super(DocList, cls).from_dataframe(df) - - def to_dataframe(self) -> 'pd.DataFrame': - """ - Save a DocList to a `pandas.DataFrame`. - The field names will be stored as column names. Each row of the dataframe corresponds - to the information of one Document. - Columns for nested fields will be named after the "__"-seperated access paths, - such as `"image__url"` for `image.url`. 
- - :return: pandas.DataFrame - """ - return super().to_dataframe() - - @classmethod - def load_binary( - cls: Type[T], - file: Union[str, bytes, pathlib.Path, io.BufferedReader, _LazyRequestReader], - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - show_progress: bool = False, - streaming: bool = False, - ) -> Union[T, Generator['T_doc', None, None]]: - """Load doc_list elements from a compressed binary file. - - :param file: File or filename or serialized bytes where the data is stored. - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use between 'lz4', 'gzip', 'bz2', 'zstd', 'lzma' - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :param streaming: if `True` returns a generator over `Document` objects. - In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a DocList object - - .. note:: - If `file` is `str` it can specify `protocol` and `compress` as file extensions. - This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a - string interpolation of the respective `protocol` and `compress` methods. - For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf` - and `compress=lz4`. - """ - return super().load_binary( - file, protocol=protocol, compress=compress, show_progress=show_progress - ) - - def save_binary( - self, - file: Union[str, pathlib.Path], - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - show_progress: bool = False, - ) -> None: - """Save DocList into a binary file. - - It will use the protocol to pick how to save the DocList. - If used 'picke-doc_list` and `protobuf-array` the DocList will be stored - and compressed at complete level using `pickle` or `protobuf`. 
- When using `protobuf` or `pickle` as protocol each Document in DocList - will be stored individually and this would make it available for streaming. - - !! note - If `file` is `str` it can specify `protocol` and `compress` as file extensions. - This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a - string interpolation of the respective `protocol` and `compress` methods. - For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` - and `compress=lz4`. - - :param file: File or filename to which the data is saved. - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use between : `lz4`, `bz2`, `lzma`, `zlib`, `gzip` - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - - - """ - return super().save_binary( - file, protocol=protocol, compress=compress, show_progress=show_progress - ) diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index 21a206a9537..e1f5b33f008 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,3 +1,5 @@ # DocList ::: docarray.array.doc_list.doc_list.DocList +::: docarray.array.doc_list.io.IOMixinArray +::: docarray.array.doc_list.pushpull.PushPullMixin diff --git a/docs/user_guide/sending/ser/send_doc.md b/docs/user_guide/sending/ser/send_doc.md index caa93c4fdb1..dd77557dbba 100644 --- a/docs/user_guide/sending/ser/send_doc.md +++ b/docs/user_guide/sending/ser/send_doc.md @@ -7,8 +7,8 @@ You need to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] before you can ## Serialization to protobuf -You can use [`to_protobuf`][docarray.base_doc.doc.BaseDoc.to_protobuf] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a protobuf message object -and use [`from_protobuf`][docarray.base_doc.doc.BaseDoc.from_protobuf] to deserialize it. 
+You can use [`to_protobuf`][docarray.base_doc.mixins.io.IOMixin.to_protobuf] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a protobuf message object +and use [`from_protobuf`][docarray.base_doc.mixins.io.IOMixin.from_protobuf] to deserialize it. ```python from typing import List diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md index 33c0a7b1718..70b1789ca5f 100644 --- a/docs/user_guide/sending/ser/send_doclist.md +++ b/docs/user_guide/sending/ser/send_doclist.md @@ -2,7 +2,7 @@ When sending or storing [`DocList`][docarray.array.doc_list.doc_list.DocList], you need to use serialization. [DocList][docarray.array.doc_list.doc_list.DocList] supports multiple ways to serialize the data. ## JSON -You can use [`to_json()`][docarray.array.doc_list.doc_list.DocList.to_json] and [`from_json()`][docarray.array.doc_list.doc_list.DocList.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: +You can use [`to_json()`][docarray.array.doc_list.io.IOMixinArray.to_json] and [`from_json()`][docarray.array.doc_list.io.IOMixinArray.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: ```python from docarray import BaseDoc, DocList @@ -24,14 +24,14 @@ with open('simple-dl.json', 'r') as f: print(dl_load_from_json) ``` -[to_json()][docarray.array.doc_list.doc_list.DocList.to_json] returns the binary representation of the json object. [from_json()][docarray.array.doc_list.doc_list.DocList.from_json] can load from either `str` or `binary` representation of the json object. +[to_json()][docarray.array.doc_list.io.IOMixinArray.to_json] returns the binary representation of the json object. [from_json()][docarray.array.doc_list.io.IOMixinArray.from_json] can load from either `str` or `binary` representation of the json object. 
```output b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03571e899a2887af1ac1b","text":"doc 1"}]' ``` ## Protobuf -To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.doc_list.DocList.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: +To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.io.IOMixinArray.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.io.IOMixinArray.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: ```python from docarray import BaseDoc, DocList @@ -49,13 +49,13 @@ print(type(proto_message_dl)) print(dl_from_proto) ``` -[to_protobuf()][docarray.array.doc_list.doc_list.DocList.to_protobuf] returns a protobuf object of `docarray_pb2.DocListProto` class. [from_protobuf()][docarray.array.doc_list.doc_list.DocList.from_protobuf] accepts a protobuf message object to construct a [DocList][docarray.array.doc_list.doc_list.DocList]. +[to_protobuf()][docarray.array.doc_list.io.IOMixinArray.to_protobuf] returns a protobuf object of `docarray_pb2.DocListProto` class. [from_protobuf()][docarray.array.doc_list.io.IOMixinArray.from_protobuf] accepts a protobuf message object to construct a [DocList][docarray.array.doc_list.doc_list.DocList]. ## Base64 When transferring over the network, you can choose `Base64` format to serialize the [`DocList`][docarray.array.doc_list.doc_list.DocList]. Serializing a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. 
-To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`from_base64()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]:
+To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.io.IOMixinArray.to_base64] and [`from_base64()`][docarray.array.doc_list.io.IOMixinArray.from_base64] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]:
 
 We support multiple compression methods. (namely : `lz4`, `bz2`, `lzma`, `zlib`, `gzip`)
 
@@ -80,7 +80,7 @@ dl_from_base64 = DocList[SimpleDoc].from_base64(
 ## Binary
 Similar to `Base64` serialization, `Binary` serialization also supports different protocols and compression methods.
 
-To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.doc_list.DocList.to_base64] and [`load_binary()`][docarray.array.doc_list.doc_list.DocList.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.from_base64]:
+To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.io.IOMixinArray.save_binary] and [`load_binary()`][docarray.array.doc_list.io.IOMixinArray.load_binary] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]:
 
 ```python
 from docarray import BaseDoc, DocList
@@ -102,7 +102,7 @@ dl_from_binary = DocList[SimpleDoc].load_binary(
 The [DocList][docarray.array.doc_list.doc_list.DocList] is stored at `simple-dl.pickle` file.
### Bytes
-Under the hood, [save_binary()][docarray.array.doc_list.doc_list.DocList.to_base64] prepares the file object and calls [to_bytes()][docarray.array.doc_list.doc_list.DocList.to_bytes] function to convert the [DocList][docarray.array.doc_list.doc_list.DocList] into a byte object. You can use [to_bytes()][docarray.array.doc_list.doc_list.DocList.to_bytes] function directly and use [from_bytes()][docarray.array.doc_list.doc_list.DocList.from_bytes] to load the [DocList][docarray.array.doc_list.doc_list.DocList] from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. Besides, [to_bytes()][docarray.array.doc_list.doc_list.DocList.to_bytes] and [save_bytes()][docarray.array.doc_list.doc_list.DocList.save_bytes] support multiple options for `compress` as well.
+Under the hood, [save_binary()][docarray.array.doc_list.io.IOMixinArray.save_binary] prepares the file object and calls [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] function to convert the [DocList][docarray.array.doc_list.doc_list.DocList] into a byte object. You can use [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] function directly and use [from_bytes()][docarray.array.doc_list.io.IOMixinArray.from_bytes] to load the [DocList][docarray.array.doc_list.doc_list.DocList] from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. Besides, [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] and [save_binary()][docarray.array.doc_list.io.IOMixinArray.save_binary] support multiple options for `compress` as well.
 
 ```python
 from docarray import BaseDoc, DocList
@@ -123,7 +123,7 @@
 
 ## CSV
 
-You can use [`from_csv()`][docarray.array.doc_list.doc_list.DocList.from_csv] and [`to_csv()`][docarray.array.doc_list.doc_list.DocList.to_csv] to de-/serializae and deserialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. 
Use the `dialect` parameter to choose the dialect of the CSV format: +You can use [`from_csv()`][docarray.array.doc_list.io.IOMixinArray.from_csv] and [`to_csv()`][docarray.array.doc_list.io.IOMixinArray.to_csv] to de-/serializae and deserialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format: ```python from docarray import BaseDoc, DocList @@ -142,7 +142,7 @@ print(dl_from_csv) ## Pandas.Dataframe -You can use [`from_dataframe()`][docarray.array.doc_list.doc_list.DocList.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.doc_list.DocList.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame: +You can use [`from_dataframe()`][docarray.array.doc_list.io.IOMixinArray.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.io.IOMixinArray.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame: ```python from docarray import BaseDoc, DocList From a6c9aa928ebe0f040697215263c308fd5d80977e Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 16:03:26 +0200 Subject: [PATCH 30/33] fix: fix update docstring Signed-off-by: samsja --- docarray/base_doc/mixins/update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 5a21738a7d4..fd962237b02 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -25,7 +25,8 @@ def update(self, other: T): Updates self with the content of other. Changes are applied to self. 
Updating one Document with another consists in the following: - setting data properties of the second Document to the first Document - if they are not None + if they are not None: + - Concatenating lists and updating sets - Updating recursively Documents and DocArrays - Updating Dictionaries of the left with the right From 70c0f45737dffa87d6526ba82d63a63905200995 Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 16:14:06 +0200 Subject: [PATCH 31/33] fix: fix ruff Signed-off-by: samsja --- docarray/base_doc/doc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 4ba3ae117c1..0ed39bd0d49 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -27,7 +27,6 @@ from pydantic.typing import AbstractSetIntStr, MappingIntStrAny from docarray.array.doc_vec.column_storage import ColumnStorageView - from docarray.proto import DocProto _console: Console = Console() From 2828cf21b93c97d5e4d9f4a525ec33204956ad13 Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 13 Apr 2023 16:26:22 +0200 Subject: [PATCH 32/33] fix: fix smth Signed-off-by: samsja --- docarray/array/doc_list/io.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 688d0310bee..9f153e2f1bd 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -702,13 +702,7 @@ def load_binary( ) -> Union[T, Generator['T_doc', None, None]]: """Load doc_list elements from a compressed binary file. - :param file: File or filename or serialized bytes where the data is stored. - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :param streaming: if `True` returns a generator over `Document` objects. 
In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a `DocList` object !!! note If `file` is `str` it can specify `protocol` and `compress` as file extensions. @@ -716,6 +710,15 @@ def load_binary( string interpolation of the respective `protocol` and `compress` methods. For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf` and `compress=lz4`. + + :param file: File or filename or serialized bytes where the data is stored. + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :param streaming: if `True` returns a generator over `Document` objects. + + :return: a `DocList` object + """ load_protocol: Optional[str] = protocol load_compress: Optional[str] = compress From 228ddffeec3648831aecce2857d3e6719224451a Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 13 Apr 2023 16:34:13 +0200 Subject: [PATCH 33/33] feat: apply charllote suggestion Co-authored-by: Charlotte Gerhaher Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docarray/base_doc/mixins/update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index fd962237b02..471e97483ba 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -43,8 +43,9 @@ def update(self, other: T): --- ```python + from typing import List, Optional + from docarray import BaseDoc - from docarray.documents import Text class MyDocument(BaseDoc):