From b7d97dc7352bcb179d7251d79889b787d8572068 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:26:06 +0200 Subject: [PATCH 01/22] refactor: rename document to doc Signed-off-by: samsja --- docarray/__init__.py | 7 +- docarray/array/abstract_array.py | 38 ++++----- docarray/array/array/array.py | 26 +++--- docarray/array/array/io.py | 20 ++--- docarray/array/array/pushpull.py | 18 ++-- docarray/array/stacked/array_stacked.py | 20 ++--- docarray/base_document/__init__.py | 8 +- .../{any_document.py => any_doc.py} | 10 +-- .../base_document/{document.py => doc.py} | 4 +- .../{document_response.py => doc_response.py} | 6 +- docarray/base_document/mixins/io.py | 6 +- docarray/base_document/mixins/update.py | 4 +- docarray/data/torch_dataset.py | 18 ++-- docarray/display/document_array_summary.py | 4 +- docarray/display/document_summary.py | 24 +++--- docarray/documents/audio.py | 6 +- docarray/documents/helper.py | 34 ++++---- docarray/documents/image.py | 8 +- docarray/documents/legacy/legacy_document.py | 4 +- docarray/documents/mesh/mesh_3d.py | 8 +- docarray/documents/mesh/vertices_and_faces.py | 4 +- .../documents/point_cloud/point_cloud_3d.py | 8 +- .../point_cloud/points_and_colors.py | 4 +- docarray/documents/text.py | 8 +- docarray/documents/video.py | 8 +- docarray/helper.py | 16 ++-- docarray/index/abstract.py | 84 +++++++++---------- docarray/index/backends/hnswlib.py | 42 +++++----- docarray/store/abstract_doc_store.py | 6 +- docarray/store/file.py | 6 +- docarray/store/jac.py | 6 +- docarray/store/s3.py | 6 +- docarray/typing/bytes/audio_bytes.py | 2 +- docarray/typing/bytes/image_bytes.py | 4 +- docarray/typing/bytes/video_bytes.py | 4 +- docarray/typing/tensor/audio/audio_ndarray.py | 4 +- .../tensor/audio/audio_tensorflow_tensor.py | 4 +- .../typing/tensor/audio/audio_torch_tensor.py | 4 +- docarray/typing/tensor/image/image_ndarray.py | 4 +- .../tensor/image/image_tensorflow_tensor.py | 4 +- .../typing/tensor/image/image_torch_tensor.py 
| 4 +- docarray/typing/tensor/ndarray.py | 7 +- docarray/typing/tensor/tensorflow_tensor.py | 8 +- docarray/typing/tensor/torch_tensor.py | 8 +- docarray/typing/tensor/video/video_ndarray.py | 4 +- .../typing/tensor/video/video_tensor_mixin.py | 4 +- .../tensor/video/video_tensorflow_tensor.py | 4 +- .../typing/tensor/video/video_torch_tensor.py | 4 +- docarray/typing/url/audio_url.py | 2 +- docarray/typing/url/image_url.py | 4 +- docarray/typing/url/text_url.py | 6 +- docarray/typing/url/url_3d/mesh_url.py | 4 +- docarray/typing/url/url_3d/point_cloud_url.py | 6 +- docarray/typing/url/video_url.py | 4 +- docarray/utils/filter.py | 10 +-- docarray/utils/find.py | 36 ++++---- docarray/utils/map.py | 22 ++--- tests/benchmark_tests/test_map.py | 4 +- .../index/base_classes/test_base_doc_store.py | 51 ++++++----- tests/index/base_classes/test_configs.py | 12 +-- tests/index/hnswlib/test_find.py | 24 +++--- tests/index/hnswlib/test_index_get_del.py | 16 ++-- tests/index/hnswlib/test_persist_data.py | 6 +- tests/integrations/array/test_torch_train.py | 4 +- tests/integrations/document/test_document.py | 28 +++---- tests/integrations/document/test_proto.py | 12 +-- tests/integrations/document/test_to_json.py | 4 +- tests/integrations/externals/test_fastapi.py | 20 ++--- .../predefined_document/test_audio.py | 4 +- .../predefined_document/test_image.py | 4 +- .../predefined_document/test_mesh.py | 4 +- .../predefined_document/test_point_cloud.py | 6 +- .../predefined_document/test_text.py | 4 +- .../predefined_document/test_video.py | 4 +- .../torch/data/test_torch_dataset.py | 4 +- tests/integrations/typing/test_anyurl.py | 4 +- tests/integrations/typing/test_embedding.py | 4 +- tests/integrations/typing/test_id.py | 4 +- tests/integrations/typing/test_image_url.py | 4 +- tests/integrations/typing/test_mesh_url.py | 4 +- tests/integrations/typing/test_ndarray.py | 4 +- .../typing/test_point_cloud_url.py | 4 +- tests/integrations/typing/test_tensor.py | 6 +- 
.../typing/test_tensorflow_tensor.py | 6 +- .../integrations/typing/test_torch_tensor.py | 6 +- .../integrations/typing/test_typing_proto.py | 12 +-- .../units/array/stack/storage/test_storage.py | 8 +- tests/units/array/stack/test_array_stacked.py | 66 +++++++-------- .../array/stack/test_array_stacked_tf.py | 34 ++++---- tests/units/array/stack/test_init.py | 6 +- tests/units/array/stack/test_proto.py | 8 +- tests/units/array/test_array.py | 42 +++++----- tests/units/array/test_array_from_to_bytes.py | 4 +- tests/units/array/test_array_from_to_csv.py | 10 +-- tests/units/array/test_array_from_to_json.py | 4 +- .../units/array/test_array_from_to_pandas.py | 10 +-- tests/units/array/test_array_proto.py | 8 +- tests/units/array/test_array_save_load.py | 4 +- tests/units/array/test_batching.py | 4 +- tests/units/array/test_generic_array.py | 8 +- tests/units/array/test_traverse.py | 18 ++-- .../document/proto/test_document_proto.py | 50 +++++------ tests/units/document/test_any_document.py | 8 +- tests/units/document/test_base_document.py | 9 +- tests/units/document/test_from_to_bytes.py | 4 +- tests/units/document/test_update.py | 12 +-- tests/units/document/test_view.py | 4 +- tests/units/test_helper.py | 10 +-- tests/units/typing/da/test_relations.py | 12 +-- .../units/typing/tensor/test_audio_tensor.py | 6 +- tests/units/typing/tensor/test_np_ops.py | 6 +- tests/units/typing/tensor/test_torch_ops.py | 6 +- .../units/typing/tensor/test_torch_tensor.py | 4 +- .../units/typing/tensor/test_video_tensor.py | 6 +- tests/units/typing/url/test_audio_url.py | 6 +- tests/units/typing/url/test_video_url.py | 6 +- tests/units/util/test_filter.py | 6 +- tests/units/util/test_find.py | 16 ++-- tests/units/util/test_map.py | 4 +- tests/units/util/test_reduce.py | 6 +- 120 files changed, 658 insertions(+), 664 deletions(-) rename docarray/base_document/{any_document.py => any_doc.py} (64%) rename docarray/base_document/{document.py => doc.py} (96%) rename 
docarray/base_document/{document_response.py => doc_response.py} (88%) diff --git a/docarray/__init__.py b/docarray/__init__.py index 03e65750c14..2d4c6271119 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -1,10 +1,11 @@ __version__ = '0.30.0a3' -from docarray.array import DocumentArray, DocumentArrayStacked -from docarray.base_document.document import BaseDocument import logging -__all__ = ['BaseDocument', 'DocumentArray', 'DocumentArrayStacked'] +from docarray.array import DocumentArray, DocumentArrayStacked +from docarray.base_document.doc import BaseDoc + +__all__ = ['BaseDoc', 'DocumentArray', 'DocumentArrayStacked'] logger = logging.getLogger('docarray') diff --git a/docarray/array/abstract_array.py b/docarray/array/abstract_array.py index ab84662b81a..48faaa11359 100644 --- a/docarray/array/abstract_array.py +++ b/docarray/array/abstract_array.py @@ -19,7 +19,7 @@ import numpy as np -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.display.document_array_summary import DocumentArraySummary from docarray.typing.abstract_type import AbstractType from docarray.utils._typing import change_cls_name @@ -28,24 +28,24 @@ from docarray.proto import DocumentArrayProto, NodeProto from docarray.typing.tensor.abstract_tensor import AbstractTensor -T = TypeVar('T', bound='AnyDocumentArray') -T_doc = TypeVar('T_doc', bound=BaseDocument) +T = TypeVar('T', bound='AnyDocArray') +T_doc = TypeVar('T_doc', bound=BaseDoc) IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] -class AnyDocumentArray(Sequence[T_doc], Generic[T_doc], AbstractType): - document_type: Type[BaseDocument] - __typed_da__: Dict[Type['AnyDocumentArray'], Dict[Type[BaseDocument], Type]] = {} +class AnyDocArray(Sequence[T_doc], Generic[T_doc], AbstractType): + document_type: Type[BaseDoc] + __typed_da__: Dict[Type['AnyDocArray'], Dict[Type[BaseDoc], Type]] = {} def __repr__(self): return f'<{self.__class__.__name__} 
(length={len(self)})>' @classmethod - def __class_getitem__(cls, item: Union[Type[BaseDocument], TypeVar, str]): + def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]): if not isinstance(item, type): return Generic.__class_getitem__.__func__(cls, item) # type: ignore # this do nothing that checking that item is valid type var or str - if not issubclass(item, BaseDocument): + if not issubclass(item, BaseDoc): raise ValueError( f'{cls.__name__}[item] item should be a Document not a {item} ' ) @@ -58,7 +58,7 @@ def __class_getitem__(cls, item: Union[Type[BaseDocument], TypeVar, str]): global _DocumentArrayTyped class _DocumentArrayTyped(cls): # type: ignore - document_type: Type[BaseDocument] = cast(Type[BaseDocument], item) + document_type: Type[BaseDoc] = cast(Type[BaseDoc], item) for field in _DocumentArrayTyped.document_type.__fields__.keys(): @@ -152,7 +152,7 @@ def _to_node_protobuf(self) -> 'NodeProto': @abstractmethod def traverse_flat( - self: 'AnyDocumentArray', + self: 'AnyDocArray', access_path: str, ) -> Union[List[Any], 'AbstractTensor']: """ @@ -167,14 +167,14 @@ def traverse_flat( EXAMPLE USAGE .. code-block:: python - from docarray import BaseDocument, DocumentArray, Text + from docarray import BaseDoc, DocumentArray, Text - class Author(BaseDocument): + class Author(BaseDoc): name: str - class Book(BaseDocument): + class Book(BaseDoc): author: Author content: Text @@ -192,14 +192,14 @@ class Book(BaseDocument): EXAMPLE USAGE .. code-block:: python - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray - class Chapter(BaseDocument): + class Chapter(BaseDoc): content: str - class Book(BaseDocument): + class Book(BaseDoc): chapters: DocumentArray[Chapter] @@ -219,7 +219,7 @@ class Book(BaseDocument): EXAMPLE USAGE .. 
code-block:: python - class Image(BaseDocument): + class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] @@ -250,10 +250,10 @@ def _traverse(node: Any, access_path: str): if isinstance(node, (DocumentArray, list)): for n in node: x = getattr(n, curr_attr) - yield from AnyDocumentArray._traverse(x, path_attrs) + yield from AnyDocArray._traverse(x, path_attrs) else: x = getattr(node, curr_attr) - yield from AnyDocumentArray._traverse(x, path_attrs) + yield from AnyDocArray._traverse(x, path_attrs) else: yield node diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index a699859b595..7d244e60c9a 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -17,14 +17,14 @@ from typing_inspect import is_union_type -from docarray.array.abstract_array import AnyDocumentArray +from docarray.array.abstract_array import AnyDocArray from docarray.array.array.io import IOMixinArray from docarray.array.array.pushpull import PushPullMixin from docarray.array.array.sequence_indexing_mixin import ( IndexingSequenceMixin, IndexIterType, ) -from docarray.base_document import AnyDocument, BaseDocument +from docarray.base_document import AnyDoc, BaseDoc from docarray.typing import NdArray if TYPE_CHECKING: @@ -37,7 +37,7 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor T = TypeVar('T', bound='DocumentArray') -T_doc = TypeVar('T_doc', bound=BaseDocument) +T_doc = TypeVar('T_doc', bound=BaseDoc) def _delegate_meth_to_data(meth_name: str) -> Callable: @@ -58,7 +58,7 @@ def _delegate_meth(self, *args, **kwargs): class DocumentArray( - IndexingSequenceMixin[T_doc], PushPullMixin, IOMixinArray, AnyDocumentArray[T_doc] + IndexingSequenceMixin[T_doc], PushPullMixin, IOMixinArray, AnyDocArray[T_doc] ): """ DocumentArray is a container of Documents. 
@@ -73,12 +73,12 @@ class DocumentArray( --- ```python - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray from docarray.typing import NdArray, ImageUrl from typing import Optional - class Image(BaseDocument): + class Image(BaseDoc): tensor: Optional[NdArray[100]] url: ImageUrl @@ -121,7 +121,7 @@ class Image(BaseDocument): """ - document_type: Type[BaseDocument] = AnyDocument + document_type: Type[BaseDoc] = AnyDoc def __init__( self, @@ -153,7 +153,7 @@ def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]: def _validate_one_doc(self, doc: T_doc) -> T_doc: """Validate if a Document is compatible with this DocumentArray""" - if not issubclass(self.document_type, AnyDocument) and not isinstance( + if not issubclass(self.document_type, AnyDoc) and not isinstance( doc, self.document_type ): raise ValueError(f'{doc} is not a {self.document_type}') @@ -216,7 +216,7 @@ def _get_data_column( if ( not is_union_type(field_type) and isinstance(field_type, type) - and issubclass(field_type, BaseDocument) + and issubclass(field_type, BaseDoc) ): # calling __class_getitem__ ourselves is a hack otherwise mypy complain # most likely a bug in mypy though @@ -250,7 +250,7 @@ def stack( Convert the DocumentArray into a DocumentArrayStacked. `Self` cannot be used afterwards :param tensor_type: Tensor Class used to wrap the stacked tensors. 
This is useful - if the BaseDocument has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor + if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor :return: A DocumentArrayStacked of the same document type as self """ from docarray.array.stacked.array_stacked import DocumentArrayStacked @@ -262,7 +262,7 @@ def stack( @classmethod def validate( cls: Type[T], - value: Union[T, Iterable[BaseDocument]], + value: Union[T, Iterable[BaseDoc]], field: 'ModelField', config: 'BaseConfig', ): @@ -279,8 +279,8 @@ def traverse_flat( self: 'DocumentArray', access_path: str, ) -> List[Any]: - nodes = list(AnyDocumentArray._traverse(node=self, access_path=access_path)) - flattened = AnyDocumentArray._flatten_one_level(nodes) + nodes = list(AnyDocArray._traverse(node=self, access_path=access_path)) + flattened = AnyDocArray._flatten_one_level(nodes) return flattened diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index 22b3321810b..4d6b5f5ca62 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -25,7 +25,7 @@ Union, ) -from docarray.base_document import AnyDocument, BaseDocument +from docarray.base_document import AnyDoc, BaseDoc from docarray.helper import ( _access_path_dict_to_nested_dict, _all_access_paths_valid, @@ -92,9 +92,9 @@ def __getitem__(self, item: slice): return self.content[item] -class IOMixinArray(Iterable[BaseDocument]): +class IOMixinArray(Iterable[BaseDoc]): - document_type: Type[BaseDocument] + document_type: Type[BaseDoc] @abstractmethod def __len__(self): @@ -103,7 +103,7 @@ def __len__(self): @abstractmethod def __init__( self, - docs: Optional[Iterable[BaseDocument]] = None, + docs: Optional[Iterable[BaseDoc]] = None, ): ... @@ -354,7 +354,7 @@ def from_csv( """ from docarray import DocumentArray - if cls.document_type == AnyDocument: + if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. 
' 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' @@ -435,10 +435,10 @@ def from_pandas(cls, df: 'pd.DataFrame') -> 'DocumentArray': import pandas as pd - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray - class Person(BaseDocument): + class Person(BaseDoc): name: str follower: int @@ -459,7 +459,7 @@ class Person(BaseDocument): """ from docarray import DocumentArray - if cls.document_type == AnyDocument: + if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' @@ -611,7 +611,7 @@ def _load_binary_stream( protocol: str = 'protobuf', compress: Optional[str] = None, show_progress: bool = False, - ) -> Generator['BaseDocument', None, None]: + ) -> Generator['BaseDoc', None, None]: """Yield `Document` objects from a binary file :param protocol: protocol to use. It can be 'pickle' or 'protobuf' @@ -668,7 +668,7 @@ def load_binary( compress: Optional[str] = None, show_progress: bool = False, streaming: bool = False, - ) -> Union[T, Generator['BaseDocument', None, None]]: + ) -> Union[T, Generator['BaseDoc', None, None]]: """Load array elements from a compressed binary file. :param file: File or filename or serialized bytes where the data is stored. 
diff --git a/docarray/array/array/pushpull.py b/docarray/array/array/pushpull.py index e93fc4afec1..0bb2489d3e8 100644 --- a/docarray/array/array/pushpull.py +++ b/docarray/array/array/pushpull.py @@ -19,18 +19,18 @@ SUPPORTED_PUSH_PULL_PROTOCOLS = get_args(PUSH_PULL_PROTOCOL) if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray from docarray.store.abstract_doc_store import AbstractDocStore SelfPushPullMixin = TypeVar('SelfPushPullMixin', bound='PushPullMixin') -class PushPullMixin(Iterable['BaseDocument']): +class PushPullMixin(Iterable['BaseDoc']): """Mixin class for push/pull functionality.""" __backends__: Dict[str, Type['AbstractDocStore']] = {} - document_type: Type['BaseDocument'] + document_type: Type['BaseDoc'] @abstractmethod def __len__(self) -> int: @@ -103,7 +103,7 @@ def push( @classmethod def push_stream( cls: Type[SelfPushPullMixin], - docs: Iterator['BaseDocument'], + docs: Iterator['BaseDoc'], url: str, public: bool = True, show_progress: bool = False, @@ -137,9 +137,9 @@ def pull( :param local_cache: store the downloaded DocumentArray to local folder :return: a :class:`DocumentArray` object """ - from docarray.base_document import AnyDocument + from docarray.base_document import AnyDoc - if cls.document_type == AnyDocument: + if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' @@ -157,7 +157,7 @@ def pull_stream( url: str, show_progress: bool = False, local_cache: bool = False, - ) -> Iterator['BaseDocument']: + ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified url. :param url: url specifying the protocol and save name of the DocumentArray. Should be of the form ``protocol://namespace/name``. e.g. 
``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` @@ -165,9 +165,9 @@ def pull_stream( :param local_cache: store the downloaded DocumentArray to local folder :return: Iterator of Documents """ - from docarray.base_document import AnyDocument + from docarray.base_document import AnyDoc - if cls.document_type == AnyDocument: + if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' diff --git a/docarray/array/stacked/array_stacked.py b/docarray/array/stacked/array_stacked.py index c609a2acc31..fad2e6678c7 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/stacked/array_stacked.py @@ -18,11 +18,11 @@ from pydantic import BaseConfig, parse_obj_as -from docarray.array.abstract_array import AnyDocumentArray +from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocumentArray from docarray.array.stacked.column_storage import ColumnStorage, ColumnStorageView from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.base_document.mixins.io import _type_to_protobuf from docarray.typing import NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -48,12 +48,12 @@ else: TensorFlowTensor = None # type: ignore -T_doc = TypeVar('T_doc', bound=BaseDocument) +T_doc = TypeVar('T_doc', bound=BaseDoc) T = TypeVar('T', bound='DocumentArrayStacked') IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] -class DocumentArrayStacked(AnyDocumentArray[T_doc]): +class DocumentArrayStacked(AnyDocArray[T_doc]): """ DocumentArrayStacked is a container of Documents appropriates to perform computation that require batches of data (ex: matrix multiplication, distance @@ -64,7 +64,7 @@ class DocumentArrayStacked(AnyDocumentArray[T_doc]): column 
based instead of row based. Each field of the schema of the DocumentArrayStack (the :attr:`~docarray.array.stacked.DocumentArrayStacked.document_type` which is a - `BaseDocument`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, stacked (torch/np/tf) tensor. + `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, stacked (torch/np/tf) tensor. If the tensor field is `AnyTensor` or a Union of tensor types, the :attr:`~docarray.array.stacked.DocumentArrayStacked.tensor_type` will be used to determine @@ -83,7 +83,7 @@ class DocumentArrayStacked(AnyDocumentArray[T_doc]): :param docs: a DocumentArray :param tensor_type: Tensor Class used to wrap the stacked tensors. This is useful - if the BaseDocument of this DocumentArrayStacked has some undefined tensor type like + if the BaseDoc of this DocumentArrayStacked has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor """ @@ -158,12 +158,12 @@ def __init__( cast(AbstractTensor, tensor_columns[field_name])[i] = val - elif issubclass(field_type, BaseDocument): + elif issubclass(field_type, BaseDoc): doc_columns[field_name] = getattr(docs, field_name).stack( tensor_type=self.tensor_type ) - elif issubclass(field_type, AnyDocumentArray): + elif issubclass(field_type, AnyDocArray): docs_list = list() for doc in docs: da = getattr(doc, field_name) @@ -521,8 +521,8 @@ def traverse_flat( self, access_path: str, ) -> Union[List[Any], 'TorchTensor', 'NdArray']: - nodes = list(AnyDocumentArray._traverse(node=self, access_path=access_path)) - flattened = AnyDocumentArray._flatten_one_level(nodes) + nodes = list(AnyDocArray._traverse(node=self, access_path=access_path)) + flattened = AnyDocArray._flatten_one_level(nodes) cls_to_check = (NdArray, TorchTensor) if TorchTensor is not None else (NdArray,) diff --git a/docarray/base_document/__init__.py b/docarray/base_document/__init__.py 
index f7ce8d3d7ab..5018394f7de 100644 --- a/docarray/base_document/__init__.py +++ b/docarray/base_document/__init__.py @@ -1,6 +1,6 @@ -from docarray.base_document.any_document import AnyDocument +from docarray.base_document.any_doc import AnyDoc from docarray.base_document.base_node import BaseNode -from docarray.base_document.document import BaseDocument -from docarray.base_document.document_response import DocumentResponse +from docarray.base_document.doc import BaseDoc +from docarray.base_document.doc_response import DocResponse -__all__ = ['AnyDocument', 'BaseDocument', 'BaseNode', 'DocumentResponse'] +__all__ = ['AnyDoc', 'BaseDoc', 'BaseNode', 'DocResponse'] diff --git a/docarray/base_document/any_document.py b/docarray/base_document/any_doc.py similarity index 64% rename from docarray/base_document/any_document.py rename to docarray/base_document/any_doc.py index 4b4172c356d..f29b6f6e01e 100644 --- a/docarray/base_document/any_document.py +++ b/docarray/base_document/any_doc.py @@ -1,11 +1,11 @@ from typing import Type -from .document import BaseDocument +from .doc import BaseDoc -class AnyDocument(BaseDocument): +class AnyDoc(BaseDoc): """ - AnyDocument is a Document that is not tied to any schema + AnyDoc is a Document that is not tied to any schema """ def __init__(self, **kwargs): @@ -13,7 +13,7 @@ def __init__(self, **kwargs): self.__dict__.update(kwargs) @classmethod - def _get_field_type(cls, field: str) -> Type['BaseDocument']: + def _get_field_type(cls, field: str) -> Type['BaseDoc']: """ Accessing the nested python Class define in the schema. 
Could be useful for reconstruction of Document in @@ -21,4 +21,4 @@ def _get_field_type(cls, field: str) -> Type['BaseDocument']: :param field: name of the field :return: """ - return AnyDocument + return AnyDoc diff --git a/docarray/base_document/document.py b/docarray/base_document/doc.py similarity index 96% rename from docarray/base_document/document.py rename to docarray/base_document/doc.py index ab69eda0157..ab3881caf6e 100644 --- a/docarray/base_document/document.py +++ b/docarray/base_document/doc.py @@ -15,10 +15,10 @@ _console: Console = Console() -T = TypeVar('T', bound='BaseDocument') +T = TypeVar('T', bound='BaseDoc') -class BaseDocument(BaseModel, IOMixin, UpdateMixin, BaseNode): +class BaseDoc(BaseModel, IOMixin, UpdateMixin, BaseNode): """ The base class for Documents """ diff --git a/docarray/base_document/document_response.py b/docarray/base_document/doc_response.py similarity index 88% rename from docarray/base_document/document_response.py rename to docarray/base_document/doc_response.py index cb756c32412..ee58adc8a8b 100644 --- a/docarray/base_document/document_response.py +++ b/docarray/base_document/doc_response.py @@ -11,7 +11,7 @@ def __init__(self, *args, **kwargs): Response = JSONResponse = NoImportResponse # type: ignore -class DocumentResponse(JSONResponse): +class DocResponse(JSONResponse): """ This is a custom Response class for FastAPI and starlette. This is needed to handle serialization of the Document types when using FastAPI @@ -19,10 +19,10 @@ class DocumentResponse(JSONResponse): EXAMPLE USAGE .. 
code-block:: python from docarray.documets import Text - from docarray.base_document import DocumentResponse + from docarray.base_document import DocResponse - @app.post("/doc/", response_model=Text, response_class=DocumentResponse) + @app.post("/doc/", response_model=Text, response_class=DocResponse) async def create_item(doc: Text) -> Text: return doc """ diff --git a/docarray/base_document/mixins/io.py b/docarray/base_document/mixins/io.py index 4c79a20fc98..190c9b1a99d 100644 --- a/docarray/base_document/mixins/io.py +++ b/docarray/base_document/mixins/io.py @@ -118,7 +118,7 @@ def _type_to_protobuf(value: Any) -> 'NodeProto': class IOMixin(Iterable[Tuple[str, Any]]): """ - IOMixin to define all the bytes/protobuf/json related part of BaseDocument + IOMixin to define all the bytes/protobuf/json related part of BaseDoc """ __fields__: Dict[str, 'ModelField'] @@ -345,12 +345,12 @@ def _get_access_paths(cls) -> List[str]: :return: list of all access paths """ - from docarray import BaseDocument + from docarray import BaseDoc paths = [] for field in cls.__fields__.keys(): field_type = cls._get_field_type(field) - if not is_union_type(field_type) and issubclass(field_type, BaseDocument): + if not is_union_type(field_type) and issubclass(field_type, BaseDoc): sub_paths = field_type._get_access_paths() for path in sub_paths: paths.append(f'{field}__{path}') diff --git a/docarray/base_document/mixins/update.py b/docarray/base_document/mixins/update.py index 1ed36100409..0f00ab8ee32 100644 --- a/docarray/base_document/mixins/update.py +++ b/docarray/base_document/mixins/update.py @@ -42,11 +42,11 @@ def update(self, other: T): .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import Text - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): content: str title: Optional[str] = None tags_: List diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 3032826b4a4..6d0d7f5ef68 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -2,11 +2,11 @@ from torch.utils.data import Dataset -from docarray import BaseDocument, DocumentArray, DocumentArrayStacked +from docarray import BaseDoc, DocumentArray, DocumentArrayStacked from docarray.typing import TorchTensor from docarray.utils._typing import change_cls_name -T_doc = TypeVar('T_doc', bound=BaseDocument) +T_doc = TypeVar('T_doc', bound=BaseDoc) class MultiModalDataset(Dataset, Generic[T_doc]): @@ -51,16 +51,16 @@ def prepend_number(text: str): .. code-block:: python import torch from torch.utils.data import DataLoader - from docarray import DocumentArray, BaseDocument + from docarray import DocumentArray, BaseDoc from docarray.data import MultiModalDataset from docarray.documents import Text - class Thesis(BaseDocument): + class Thesis(BaseDoc): title: Text - class Student(BaseDocument): + class Student(BaseDoc): thesis: Thesis @@ -92,8 +92,8 @@ def add_nonsense(student: Student): print(batch.thesis.title.embedding) """ - document_type: Optional[Type[BaseDocument]] = None - __typed_ds__: Dict[Type[BaseDocument], Type['MultiModalDataset']] = {} + document_type: Optional[Type[BaseDoc]] = None + __typed_ds__: Dict[Type[BaseDoc], Type['MultiModalDataset']] = {} def __init__( self, da: 'DocumentArray[T_doc]', preprocessing: Dict[str, Callable] @@ -132,8 +132,8 @@ def collate_fn(cls, batch: List[T_doc]): return batch_da @classmethod - def __class_getitem__(cls, item: Type[BaseDocument]) -> Type['MultiModalDataset']: - if not issubclass(item, BaseDocument): + def __class_getitem__(cls, item: Type[BaseDoc]) -> 
Type['MultiModalDataset']: + if not issubclass(item, BaseDoc): raise ValueError( f'{cls.__name__}[item] item should be a Document not a {item} ' ) diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index e7187ca6636..7ed5e4ca503 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -4,11 +4,11 @@ if TYPE_CHECKING: from docarray.array import DocumentArrayStacked - from docarray.array.abstract_array import AnyDocumentArray + from docarray.array.abstract_array import AnyDocArray class DocumentArraySummary: - def __init__(self, da: 'AnyDocumentArray'): + def __init__(self, da: 'AnyDocArray'): self.da = da def summary(self) -> None: diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index e8c6f818ea9..2606ddcd4b1 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -6,7 +6,7 @@ from typing_extensions import TYPE_CHECKING from typing_inspect import is_optional_type, is_union_type -from docarray.base_document.document import BaseDocument +from docarray.base_document.doc import BaseDoc from docarray.display.tensor_display import TensorDisplay from docarray.typing import ID from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -20,7 +20,7 @@ class DocumentSummary: def __init__( self, - doc: Optional['BaseDocument'] = None, + doc: Optional['BaseDoc'] = None, ): self.doc = doc @@ -32,7 +32,7 @@ def summary(self) -> None: rich.print(t) @staticmethod - def schema_summary(cls: Type['BaseDocument']) -> None: + def schema_summary(cls: Type['BaseDoc']) -> None: """Print a summary of the Documents schema.""" from rich.console import Console from rich.panel import Panel @@ -49,13 +49,13 @@ def schema_summary(cls: Type['BaseDocument']) -> None: console.print(panel) @staticmethod - def _get_schema(cls: Type['BaseDocument'], doc_name: Optional[str] = None) -> Tree: + def _get_schema(cls: 
Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: """Get Documents schema as a rich.tree.Tree object.""" import re from rich.tree import Tree - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray root = cls.__name__ if doc_name is None else f'{doc_name}: {cls.__name__}' tree = Tree(root, highlight=True) @@ -74,7 +74,7 @@ def _get_schema(cls: Type['BaseDocument'], doc_name: Optional[str] = None) -> Tr if is_union_type(field_type) or is_optional_type(field_type): sub_tree = Tree(node_name, highlight=True) for arg in field_type.__args__: - if issubclass(arg, BaseDocument): + if issubclass(arg, BaseDoc): sub_tree.add(DocumentSummary._get_schema(cls=arg)) elif issubclass(arg, DocumentArray): sub_tree.add( @@ -82,7 +82,7 @@ def _get_schema(cls: Type['BaseDocument'], doc_name: Optional[str] = None) -> Tr ) tree.add(sub_tree) - elif issubclass(field_type, BaseDocument): + elif issubclass(field_type, BaseDoc): tree.add( DocumentSummary._get_schema(cls=field_type, doc_name=field_name) ) @@ -112,7 +112,7 @@ def __rich_console__( from rich import box, text from rich.table import Table - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray table = Table( 'Attribute', @@ -125,7 +125,7 @@ def __rich_console__( for field_name, value in self.doc.__dict__.items(): col_1 = f'{field_name}: {value.__class__.__name__}' if ( - isinstance(value, (ID, DocumentArray, BaseDocument)) + isinstance(value, (ID, DocumentArray, BaseDoc)) or field_name.startswith('_') or value is None ): @@ -177,7 +177,7 @@ def _plot_recursion( :return: Tree with all children. 
""" - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray tree = Tree(node) if tree is None else tree.add(node) # type: ignore @@ -185,14 +185,14 @@ def _plot_recursion( nested_attrs = [ k for k, v in node.doc.__dict__.items() - if isinstance(v, (DocumentArray, BaseDocument)) + if isinstance(v, (DocumentArray, BaseDoc)) ] for attr in nested_attrs: value = getattr(node.doc, attr) attr_type = value.__class__.__name__ icon = ':diamond_with_a_dot:' - if isinstance(value, BaseDocument): + if isinstance(value, BaseDoc): icon = ':large_orange_diamond:' value = [value] diff --git a/docarray/documents/audio.py b/docarray/documents/audio.py index 036d28c33ba..4db0a3dc899 100644 --- a/docarray/documents/audio.py +++ b/docarray/documents/audio.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.typing import AnyEmbedding, AudioUrl from docarray.typing.bytes.audio_bytes import AudioBytes from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -21,7 +21,7 @@ T = TypeVar('T', bound='AudioDoc') -class AudioDoc(BaseDocument): +class AudioDoc(BaseDoc): """ Document for handling audios. @@ -70,7 +70,7 @@ class MyAudio(Audio): .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import AudioDoc, TextDoc diff --git a/docarray/documents/helper.py b/docarray/documents/helper.py index a7f7cc35a5f..71a56260864 100644 --- a/docarray/documents/helper.py +++ b/docarray/documents/helper.py @@ -4,19 +4,19 @@ from pydantic.config import BaseConfig from typing_extensions import TypedDict -from docarray import BaseDocument +from docarray import BaseDoc if TYPE_CHECKING: from pydantic.typing import AnyClassMethod - T_doc = TypeVar('T_doc', bound=BaseDocument) + T_doc = TypeVar('T_doc', bound=BaseDoc) def create_doc( __model_name: str, *, __config__: Optional[Type[BaseConfig]] = None, - __base__: Type['T_doc'] = BaseDocument, # type: ignore + __base__: Type['T_doc'] = BaseDoc, # type: ignore __module__: str = __name__, __validators__: Dict[str, 'AnyClassMethod'] = None, # type: ignore __cls_kwargs__: Dict[str, Any] = None, # type: ignore @@ -24,10 +24,10 @@ def create_doc( **field_definitions: Any, ) -> Type['T_doc']: """ - Dynamically create a subclass of BaseDocument. This is a wrapper around pydantic's create_model. + Dynamically create a subclass of BaseDoc. This is a wrapper around pydantic's create_model. 
:param __model_name: name of the created model :param __config__: config class to use for the new model - :param __base__: base class for the new model to inherit from, must be BaseDocument or its subclass + :param __base__: base class for the new model to inherit from, must be BaseDoc or its subclass :param __module__: module of the created model :param __validators__: a dict of method names and @validator class methods :param __cls_kwargs__: a dict for class creation @@ -51,13 +51,13 @@ def create_doc( tensor=(AudioNdArray, ...), ) - assert issubclass(MyAudio, BaseDocument) + assert issubclass(MyAudio, BaseDoc) assert issubclass(MyAudio, Audio) """ - if not issubclass(__base__, BaseDocument): - raise ValueError(f'{type(__base__)} is not a BaseDocument or its subclass') + if not issubclass(__base__, BaseDoc): + raise ValueError(f'{type(__base__)} is not a BaseDoc or its subclass') doc = create_model( __model_name, @@ -78,7 +78,7 @@ def create_doc_from_typeddict( **kwargs: Any, ): """ - Create a subclass of BaseDocument based on the fields of a `TypedDict`. This is a wrapper around pydantic's create_model_from_typeddict. + Create a subclass of BaseDoc based on the fields of a `TypedDict`. This is a wrapper around pydantic's create_model_from_typeddict. 
:param typeddict_cls: TypedDict class to use for the new Document class :param kwargs: extra arguments to pass to `create_model_from_typeddict` :return: the new Document class @@ -89,7 +89,7 @@ def create_doc_from_typeddict( from typing_extensions import TypedDict - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import Audio from docarray.documents.helper import create_doc_from_typeddict from docarray.typing.tensor.audio import AudioNdArray @@ -102,18 +102,16 @@ class MyAudio(TypedDict): Doc = create_doc_from_typeddict(MyAudio, __base__=Audio) - assert issubclass(Doc, BaseDocument) + assert issubclass(Doc, BaseDoc) assert issubclass(Doc, Audio) """ if '__base__' in kwargs: - if not issubclass(kwargs['__base__'], BaseDocument): - raise ValueError( - f'{kwargs["__base__"]} is not a BaseDocument or its subclass' - ) + if not issubclass(kwargs['__base__'], BaseDoc): + raise ValueError(f'{kwargs["__base__"]} is not a BaseDoc or its subclass') else: - kwargs['__base__'] = BaseDocument + kwargs['__base__'] = BaseDoc doc = create_model_from_typeddict(typeddict_cls, **kwargs) @@ -122,7 +120,7 @@ class MyAudio(TypedDict): def create_doc_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc']: """ - Create a subclass of BaseDocument based on example data given as a dictionary. + Create a subclass of BaseDoc based on example data given as a dictionary. In case the example contains None as a value, corresponding field will be viewed as the type Any. 
@@ -143,7 +141,7 @@ def create_doc_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_ MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict) - assert issubclass(MyDoc, BaseDocument) + assert issubclass(MyDoc, BaseDoc) """ if not data_dict: diff --git a/docarray/documents/image.py b/docarray/documents/image.py index e1def7b0470..6a6c643c68b 100644 --- a/docarray/documents/image.py +++ b/docarray/documents/image.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.typing import AnyEmbedding, ImageBytes, ImageUrl from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.image.image_tensor import ImageTensor @@ -19,7 +19,7 @@ import tensorflow as tf # type: ignore -class ImageDoc(BaseDocument): +class ImageDoc(BaseDoc): """ Document for handling images. It can contain an ImageUrl (`Image.url`), an AnyTensor (`Image.tensor`), @@ -64,12 +64,12 @@ class MyImage(ImageDoc): .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import ImageDoc, TextDoc # compose it - class MultiModalDoc(BaseDocument): + class MultiModalDoc(BaseDoc): image: Image text: Text diff --git a/docarray/documents/legacy/legacy_document.py b/docarray/documents/legacy/legacy_document.py index e3bd0c3b20c..fbb59369153 100644 --- a/docarray/documents/legacy/legacy_document.py +++ b/docarray/documents/legacy/legacy_document.py @@ -2,11 +2,11 @@ from typing import Any, Dict, Optional -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.typing import AnyEmbedding, AnyTensor -class LegacyDocument(BaseDocument): +class LegacyDocument(BaseDoc): """ This Document is the LegacyDocument. It follows the same schema as in DocArray v1. It can be useful to start migrating a codebase from v1 to v2. 
diff --git a/docarray/documents/mesh/mesh_3d.py b/docarray/documents/mesh/mesh_3d.py index a2706493423..10a45607486 100644 --- a/docarray/documents/mesh/mesh_3d.py +++ b/docarray/documents/mesh/mesh_3d.py @@ -1,6 +1,6 @@ from typing import Any, Optional, Type, TypeVar, Union -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces from docarray.typing.tensor.embedding import AnyEmbedding from docarray.typing.url.url_3d.mesh_url import Mesh3DUrl @@ -8,7 +8,7 @@ T = TypeVar('T', bound='Mesh3D') -class Mesh3D(BaseDocument): +class Mesh3D(BaseDoc): """ Document for handling meshes for 3D data representation. @@ -63,12 +63,12 @@ class MyMesh3D(Mesh3D): .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import Mesh3D, Text # compose it - class MultiModalDoc(BaseDocument): + class MultiModalDoc(BaseDoc): mesh: Mesh3D text: Text diff --git a/docarray/documents/mesh/vertices_and_faces.py b/docarray/documents/mesh/vertices_and_faces.py index a0e12e303e7..d6909414a8e 100644 --- a/docarray/documents/mesh/vertices_and_faces.py +++ b/docarray/documents/mesh/vertices_and_faces.py @@ -1,12 +1,12 @@ from typing import Any, Type, TypeVar, Union -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.typing.tensor.tensor import AnyTensor T = TypeVar('T', bound='VerticesAndFaces') -class VerticesAndFaces(BaseDocument): +class VerticesAndFaces(BaseDoc): """ Document for handling 3D mesh tensor data. 
diff --git a/docarray/documents/point_cloud/point_cloud_3d.py b/docarray/documents/point_cloud/point_cloud_3d.py index 52958a81ddb..937e2a77f20 100644 --- a/docarray/documents/point_cloud/point_cloud_3d.py +++ b/docarray/documents/point_cloud/point_cloud_3d.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.documents.point_cloud.points_and_colors import PointsAndColors from docarray.typing import AnyEmbedding, PointCloud3DUrl from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -19,7 +19,7 @@ T = TypeVar('T', bound='PointCloud3D') -class PointCloud3D(BaseDocument): +class PointCloud3D(BaseDoc): """ Document for handling point clouds for 3D data representation. @@ -71,12 +71,12 @@ class MyPointCloud3D(PointCloud3D): .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import PointCloud3D, Text # compose it - class MultiModalDoc(BaseDocument): + class MultiModalDoc(BaseDoc): point_cloud: PointCloud3D text: Text diff --git a/docarray/documents/point_cloud/points_and_colors.py b/docarray/documents/point_cloud/points_and_colors.py index db588022b66..c64cd4f48bd 100644 --- a/docarray/documents/point_cloud/points_and_colors.py +++ b/docarray/documents/point_cloud/points_and_colors.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.typing import AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils.misc import is_tf_available, is_torch_available @@ -18,7 +18,7 @@ T = TypeVar('T', bound='PointsAndColors') -class PointsAndColors(BaseDocument): +class PointsAndColors(BaseDoc): """ Document for handling point clouds tensor data. 
diff --git a/docarray/documents/text.py b/docarray/documents/text.py index d88f0c4395b..fb19397dd39 100644 --- a/docarray/documents/text.py +++ b/docarray/documents/text.py @@ -1,13 +1,13 @@ from typing import Any, Optional, Type, TypeVar, Union -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.typing import TextUrl from docarray.typing.tensor.embedding import AnyEmbedding T = TypeVar('T', bound='TextDoc') -class TextDoc(BaseDocument): +class TextDoc(BaseDoc): """ Document for handling text. It can contain a TextUrl (`TextDoc.url`), a str (`TextDoc.text`), @@ -60,12 +60,12 @@ class MyText(Text): .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import ImageDoc, TextDoc # compose it - class MultiModalDoc(BaseDocument): + class MultiModalDoc(BaseDoc): image_doc: Image text_doc: Text diff --git a/docarray/documents/video.py b/docarray/documents/video.py index 3e338b632db..ceef6122561 100644 --- a/docarray/documents/video.py +++ b/docarray/documents/video.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.documents import AudioDoc from docarray.typing import AnyEmbedding, AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -23,7 +23,7 @@ T = TypeVar('T', bound='VideoDoc') -class VideoDoc(BaseDocument): +class VideoDoc(BaseDoc): """ Document for handling video. The Video Document can contain a VideoUrl (`VideoDoc.url`), an Audio Document @@ -73,12 +73,12 @@ class MyVideo(Video): .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import TextDoc, VideoDoc # compose it - class MultiModalDoc(BaseDocument): + class MultiModalDoc(BaseDoc): video: Video text: Text diff --git a/docarray/helper.py b/docarray/helper.py index 9c843a3c675..d921f8b3f7d 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -16,10 +16,10 @@ ) if TYPE_CHECKING: - from docarray import BaseDocument + from docarray import BaseDoc -def _is_access_path_valid(doc_type: Type['BaseDocument'], access_path: str) -> bool: +def _is_access_path_valid(doc_type: Type['BaseDoc'], access_path: str) -> bool: """ Check if a given access path ("__"-separated) is a valid path for a given Document class. """ @@ -29,7 +29,7 @@ def _is_access_path_valid(doc_type: Type['BaseDocument'], access_path: str) -> b def _all_access_paths_valid( - doc_type: Type['BaseDocument'], access_paths: List[str] + doc_type: Type['BaseDoc'], access_paths: List[str] ) -> List[bool]: """ Check if all access paths ("__"-separated) are valid for a given Document class. @@ -127,7 +127,7 @@ def _update_nested_dicts( def _get_field_type_by_access_path( - doc_type: Type['BaseDocument'], access_path: str + doc_type: Type['BaseDoc'], access_path: str ) -> Optional[Type]: """ Get field type by "__"-separated access path. @@ -135,7 +135,7 @@ def _get_field_type_by_access_path( :param access_path: "__"-separated access path :return: field type of accessed attribute. If access path is invalid, return None. 
""" - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray field, _, remaining = access_path.partition('__') field_valid = field in doc_type.__fields__.keys() @@ -147,7 +147,7 @@ def _get_field_type_by_access_path( d = doc_type._get_field_type(field) if issubclass(d, DocumentArray): return _get_field_type_by_access_path(d.document_type, remaining) - elif issubclass(d, BaseDocument): + elif issubclass(d, BaseDoc): return _get_field_type_by_access_path(d, remaining) else: return None @@ -180,12 +180,12 @@ def get_paths( .. code-block:: python from typing import Optional - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray from docarray.helper import get_paths from docarray.typing import TextUrl, ImageUrl - class Banner(BaseDocument): + class Banner(BaseDoc): text_url: TextUrl image_url: Optional[ImageUrl] diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 6aec9549726..2a7d99d2ff3 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -23,8 +23,8 @@ from pydantic.error_wrappers import ValidationError from typing_inspect import get_args, is_optional_type, is_union_type -from docarray import BaseDocument, DocumentArray -from docarray.array.abstract_array import AnyDocumentArray +from docarray import BaseDoc, DocumentArray +from docarray.array.abstract_array import AnyDocArray from docarray.typing import AnyTensor from docarray.utils._typing import unwrap_optional_type from docarray.utils.find import FindResult, _FindResult @@ -41,7 +41,7 @@ from docarray.typing import TensorFlowTensor -TSchema = TypeVar('TSchema', bound=BaseDocument) +TSchema = TypeVar('TSchema', bound=BaseDoc) class FindResultBatched(NamedTuple): @@ -81,12 +81,12 @@ class _ColumnInfo: config: Dict[str, Any] -class BaseDocumentIndex(ABC, Generic[TSchema]): +class BaseDocIndex(ABC, Generic[TSchema]): """Abstract class for all Document Stores""" - # the BaseDocument that 
defines the schema of the store + # the BaseDoc that defines the schema of the store # for subclasses this is filled automatically - _schema: Optional[Type[BaseDocument]] = None + _schema: Optional[Type[BaseDoc]] = None def __init__(self, db_config=None, **kwargs): if self._schema is None: @@ -343,9 +343,7 @@ def __getitem__( elif isinstance(doc_sequence[0], Dict): out_da = self._dict_list_to_docarray(doc_sequence) # type: ignore else: - da_cls = DocumentArray.__class_getitem__( - cast(Type[BaseDocument], self._schema) - ) + da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) out_da = da_cls(doc_sequence) return out_da[0] if return_singleton else out_da @@ -379,12 +377,12 @@ def configure(self, runtime_config=None, **kwargs): raise ValueError(f'runtime_config must be of type {self.RuntimeConfig}') self._runtime_config = runtime_config - def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): + def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): """index Documents into the index. :param docs: Documents to index. 
""" - if not isinstance(docs, (BaseDocument, DocumentArray)): + if not isinstance(docs, (BaseDoc, DocumentArray)): self._logger.warning( 'Passing a sequence of Documents that is not a DocumentArray comes at ' 'a performance penalty, since compatibility with the schema of Index ' @@ -397,7 +395,7 @@ def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): def find( self, - query: Union[AnyTensor, BaseDocument], + query: Union[AnyTensor, BaseDoc], search_field: str = 'embedding', limit: int = 10, **kwargs, @@ -414,7 +412,7 @@ def find( :return: a named tuple containing `documents` and `scores` """ self._logger.debug(f'Executing `find` for search field {search_field}') - if isinstance(query, BaseDocument): + if isinstance(query, BaseDoc): query_vec = self._get_values_by_column([query], search_field)[0] else: query_vec = query @@ -509,7 +507,7 @@ def filter_batched( def text_search( self, - query: Union[str, BaseDocument], + query: Union[str, BaseDoc], search_field: str = 'text', limit: int = 10, **kwargs, @@ -522,7 +520,7 @@ def text_search( :return: a named tuple containing `documents` and `scores` """ self._logger.debug(f'Executing `text_search` for search field {search_field}') - if isinstance(query, BaseDocument): + if isinstance(query, BaseDoc): query_text = self._get_values_by_column([query], search_field)[0] else: query_text = query @@ -537,7 +535,7 @@ def text_search( def text_search_batched( self, - queries: Union[Sequence[str], Sequence[BaseDocument]], + queries: Union[Sequence[str], Sequence[BaseDoc]], search_field: str = 'text', limit: int = 10, **kwargs, @@ -552,8 +550,8 @@ def text_search_batched( self._logger.debug( f'Executing `text_search_batched` for search field {search_field}' ) - if isinstance(queries[0], BaseDocument): - query_docs: Sequence[BaseDocument] = cast(Sequence[BaseDocument], queries) + if isinstance(queries[0], BaseDoc): + query_docs: Sequence[BaseDoc] = cast(Sequence[BaseDoc], queries) query_texts: Sequence[str] = 
self._get_values_by_column( query_docs, search_field ) @@ -573,7 +571,7 @@ def text_search_batched( ########################################################## @staticmethod - def _get_values_by_column(docs: Sequence[BaseDocument], col_name: str) -> List[Any]: + def _get_values_by_column(docs: Sequence[BaseDoc], col_name: str) -> List[Any]: """Get the value of a column of a document. :param docs: The DocumentArray to get the values from @@ -584,7 +582,7 @@ def _get_values_by_column(docs: Sequence[BaseDocument], col_name: str) -> List[A for doc in docs: if '__' in col_name: fields = col_name.split('__') - leaf_doc: BaseDocument = doc + leaf_doc: BaseDoc = doc for f in fields[:-1]: leaf_doc = getattr(leaf_doc, f) leaf_vals.append(getattr(leaf_doc, fields[-1])) @@ -605,7 +603,7 @@ def _transpose_col_value_dict( return (dict(zip(col_value_dict, row)) for row in zip(*col_value_dict.values())) def _get_col_value_dict( - self, docs: Union[BaseDocument, Sequence[BaseDocument]] + self, docs: Union[BaseDoc, Sequence[BaseDoc]] ) -> Dict[str, Generator[Any, None, None]]: """ Get all data from a (sequence of) document(s), flattened out by column. @@ -614,8 +612,8 @@ def _get_col_value_dict( :param docs: The document(s) to get the data from :return: A dictionary mapping column names to a generator of values """ - if isinstance(docs, BaseDocument): - docs_seq: Sequence[BaseDocument] = [docs] + if isinstance(docs, BaseDoc): + docs_seq: Sequence[BaseDoc] = [docs] else: docs_seq = docs @@ -639,7 +637,7 @@ def __class_getitem__(cls, item: Type[TSchema]): # do nothing # enables use in static contexts with type vars, e.g. 
as type annotation return Generic.__class_getitem__.__func__(cls, item) - if not issubclass(item, BaseDocument): + if not issubclass(item, BaseDoc): raise ValueError( f'{cls.__name__}[item] `item` should be a Document not a {item} ' ) @@ -662,7 +660,7 @@ def build_query(self) -> QueryBuilder: @classmethod def _flatten_schema( - cls, schema: Type[BaseDocument], name_prefix: str = '' + cls, schema: Type[BaseDoc], name_prefix: str = '' ) -> List[Tuple[str, Type, 'ModelField']]: """Flatten the schema of a Document into a list of column names and types. Nested Documents are handled in a recursive manner by adding `'__'` as a prefix to the column name. @@ -684,13 +682,13 @@ def _flatten_schema( for t_arg in union_args: if t_arg is type(None): pass - elif issubclass(t_arg, BaseDocument): + elif issubclass(t_arg, BaseDoc): names_types_fields.extend( cls._flatten_schema(t_arg, name_prefix=inner_prefix) ) else: names_types_fields.append((field_name, t_, field_)) - elif issubclass(t_, BaseDocument): + elif issubclass(t_, BaseDoc): names_types_fields.extend( cls._flatten_schema(t_, name_prefix=inner_prefix) ) @@ -698,12 +696,10 @@ def _flatten_schema( names_types_fields.append((name_prefix + field_name, t_, field_)) return names_types_fields - def _create_column_infos( - self, schema: Type[BaseDocument] - ) -> Dict[str, _ColumnInfo]: + def _create_column_infos(self, schema: Type[BaseDoc]) -> Dict[str, _ColumnInfo]: """Collects information about every column that is implied by a given schema. - :param schema: The schema (subclass of BaseDocument) to analyze and parse + :param schema: The schema (subclass of BaseDoc) to analyze and parse columns from :returns: A dictionary mapping from column names to column information. """ @@ -718,7 +714,7 @@ def _create_column_infos( 'Union types are not supported in the schema of a DocumentIndex.' f' Instead of using type {type_} use a single specific type.' 
) - elif issubclass(type_, AnyDocumentArray): + elif issubclass(type_, AnyDocArray): raise ValueError( 'Indexing field of DocumentArray type (=subindex)' 'is not yet supported.' @@ -758,8 +754,8 @@ def _create_single_column(self, field: 'ModelField', type_: Type) -> _ColumnInfo ) def _validate_docs( - self, docs: Union[BaseDocument, Sequence[BaseDocument]] - ) -> DocumentArray[BaseDocument]: + self, docs: Union[BaseDoc, Sequence[BaseDoc]] + ) -> DocumentArray[BaseDoc]: """Validates Document against the schema of the Document Index. For validation to pass, the schema of `docs` and the schema of the Document Index need to evaluate to the same flattened columns. @@ -771,12 +767,12 @@ def _validate_docs( DocumentArray, evaluation is performed for every Document in `docs`. :return: A DocumentArray containing the Documents in `docs` """ - if isinstance(docs, BaseDocument): + if isinstance(docs, BaseDoc): docs = [docs] if isinstance(docs, DocumentArray): # validation shortcut for DocumentArray; only look at the schema reference_schema_flat = self._flatten_schema( - cast(Type[BaseDocument], self._schema) + cast(Type[BaseDoc], self._schema) ) reference_names = [name for (name, _, _) in reference_schema_flat] reference_types = [t_ for (_, t_, _) in reference_schema_flat] @@ -797,9 +793,7 @@ def _validate_docs( for i in range(len(docs)): # validate the data try: - out_docs.append( - cast(Type[BaseDocument], self._schema).parse_obj(docs[i]) - ) + out_docs.append(cast(Type[BaseDoc], self._schema).parse_obj(docs[i])) except (ValueError, ValidationError): raise ValueError( 'The schema of the input Documents is not compatible with the schema of the Document Index.' @@ -807,7 +801,7 @@ def _validate_docs( ' and that the types of your data match the types of the Document Index schema.' 
) - return DocumentArray[BaseDocument].construct(out_docs) + return DocumentArray[BaseDoc].construct(out_docs) def _to_numpy(self, val: Any, allow_passthrough=False) -> Any: """ @@ -833,8 +827,8 @@ def _to_numpy(self, val: Any, allow_passthrough=False) -> Any: raise ValueError(f'Unsupported input type for {type(self)}: {type(val)}') def _convert_dict_to_doc( - self, doc_dict: Dict[str, Any], schema: Type[BaseDocument] - ) -> BaseDocument: + self, doc_dict: Dict[str, Any], schema: Type[BaseDoc] + ) -> BaseDoc: """ Convert a dict to a Document object. @@ -845,7 +839,7 @@ def _convert_dict_to_doc( for field_name, _ in schema.__fields__.items(): t_ = unwrap_optional_type(schema._get_field_type(field_name)) - if issubclass(t_, BaseDocument): + if issubclass(t_, BaseDoc): inner_dict = {} fields = [ @@ -857,7 +851,7 @@ def _convert_dict_to_doc( doc_dict[field_name] = self._convert_dict_to_doc(inner_dict, t_) - schema_cls = cast(Type[BaseDocument], schema) + schema_cls = cast(Type[BaseDoc], schema) return schema_cls(**doc_dict) def _dict_list_to_docarray( @@ -866,5 +860,5 @@ def _dict_list_to_docarray( """Convert a list of docs in dict type to a DocumentArray of the schema type.""" doc_list = [self._convert_dict_to_doc(doc_dict, self._schema) for doc_dict in dict_list] # type: ignore - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDocument], self._schema)) + da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(doc_list) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 4f1799fb7d7..8b5cf27804b 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -20,9 +20,9 @@ import hnswlib import numpy as np -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.index.abstract import ( - BaseDocumentIndex, + BaseDocIndex, _ColumnInfo, _FindResultBatched, _raise_not_composable, @@ -33,7 +33,7 @@ from 
docarray.utils.find import _FindResult from docarray.utils.misc import is_np_int, is_tf_available, is_torch_available -TSchema = TypeVar('TSchema', bound=BaseDocument) +TSchema = TypeVar('TSchema', bound=BaseDoc) T = TypeVar('T', bound='HnswDocumentIndex') HNSWLIB_PY_VEC_TYPES = [list, tuple, np.ndarray] @@ -65,7 +65,7 @@ def inner(self, *args, **kwargs): return inner -class HnswDocumentIndex(BaseDocumentIndex, Generic[TSchema]): +class HnswDocumentIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) self._db_config = cast(HnswDocumentIndex.DBConfig, self._db_config) @@ -121,7 +121,7 @@ def __init__(self, db_config=None, **kwargs): ############################################### # Inner classes for query builder and configs # ############################################### - class QueryBuilder(BaseDocumentIndex.QueryBuilder): + class QueryBuilder(BaseDocIndex.QueryBuilder): def __init__(self, query: Optional[List[Tuple[str, Dict]]] = None): super().__init__() # list of tuples (method name, kwargs) @@ -138,11 +138,11 @@ def build(self, *args, **kwargs) -> Any: text_search_batched = _raise_not_supported('text_search') @dataclass - class DBConfig(BaseDocumentIndex.DBConfig): + class DBConfig(BaseDocIndex.DBConfig): work_dir: str = '.' @dataclass - class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): + class RuntimeConfig(BaseDocIndex.RuntimeConfig): default_column_config: Dict[Type, Dict[str, Any]] = field( default_factory=lambda: { np.ndarray: { @@ -177,7 +177,7 @@ def _index(self, column_data_dic, **kwargs): # not needed, we implement `index` directly ... 
- def index(self, docs: Union[BaseDocument, Sequence[BaseDocument]], **kwargs): + def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): """index a document into the store""" if kwargs: raise ValueError(f'{list(kwargs.keys())} are not valid keyword arguments') @@ -204,11 +204,11 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: f'args and kwargs not supported for `execute_query` on {type(self)}' ) - ann_docs = DocumentArray.__class_getitem__( - cast(Type[BaseDocument], self._schema) - )([]) + ann_docs = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema))( + [] + ) filter_conditions = [] - doc_to_score: Dict[BaseDocument, Any] = {} + doc_to_score: Dict[BaseDoc, Any] = {} for op, op_kwargs in query: if op == 'find': docs, scores = self.find(**op_kwargs) @@ -220,9 +220,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: - da_cls = DocumentArray.__class_getitem__( - cast(Type[BaseDocument], self._schema) - ) + da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) docs_filtered = da_cls(filter_docs(docs_filtered, cond)) self._logger.debug(f'{len(docs_filtered)} results found') @@ -364,7 +362,7 @@ def _create_docs_table(self): 'CREATE TABLE IF NOT EXISTS docs (doc_id INTEGER PRIMARY KEY, data BLOB)' ) - def _send_docs_to_sqlite(self, docs: Sequence[BaseDocument]): + def _send_docs_to_sqlite(self, docs: Sequence[BaseDoc]): ids = (self._to_hashed_id(doc.id) for doc in docs) self._sqlite_cursor.executemany( 'INSERT INTO docs VALUES (?, ?)', @@ -381,13 +379,13 @@ def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]): 'SELECT data FROM docs WHERE doc_id IN %s' % sql_id_list, ) rows = self._sqlite_cursor.fetchall() - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDocument], self._schema)) + da_cls = 
DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls([self._doc_from_bytes(row[0]) for row in rows]) def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocumentArray[TSchema]: hashed_ids = tuple(self._to_hashed_id(id_) for id_ in doc_ids) docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDocument], self._schema)) + da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocumentArray: @@ -396,7 +394,7 @@ def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocumentArray def _in_position(doc): return hashed_ids.index(self._to_hashed_id(doc.id)) - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDocument], self._schema)) + da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(sorted(docs_unsorted, key=_in_position)) def _delete_docs_from_sqlite(self, doc_ids: Sequence[Union[str, int]]): @@ -413,9 +411,9 @@ def _get_num_docs_sqlite(self) -> int: return self._sqlite_cursor.fetchone()[0] # serialization helpers - def _doc_to_bytes(self, doc: BaseDocument) -> bytes: + def _doc_to_bytes(self, doc: BaseDoc) -> bytes: return doc.to_protobuf().SerializeToString() - def _doc_from_bytes(self, data: bytes) -> BaseDocument: - schema_cls = cast(Type[BaseDocument], self._schema) + def _doc_from_bytes(self, data: bytes) -> BaseDoc: + schema_cls = cast(Type[BaseDoc], self._schema) return schema_cls.from_protobuf(DocumentProto.FromString(data)) diff --git a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index cf212ceada4..ff0252bb397 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -4,7 +4,7 @@ from typing_extensions import TYPE_CHECKING if TYPE_CHECKING: - from docarray import BaseDocument, 
DocumentArray + from docarray import BaseDoc, DocumentArray class AbstractDocStore(ABC): @@ -52,7 +52,7 @@ def push( @staticmethod @abstractmethod def push_stream( - docs: Iterator['BaseDocument'], + docs: Iterator['BaseDoc'], url: str, public: bool = True, show_progress: bool = False, @@ -93,7 +93,7 @@ def pull_stream( name: str, show_progress: bool, local_cache: bool, - ) -> Iterator['BaseDocument']: + ) -> Iterator['BaseDoc']: """Pull a stream of documents from the specified name. :param da_cls: The DocumentArray class to instantiate diff --git a/docarray/store/file.py b/docarray/store/file.py index 287acbd8cc3..fe7f3adb003 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -10,7 +10,7 @@ from docarray.utils.cache import get_cache_path if TYPE_CHECKING: - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray SelfFileDocStore = TypeVar('SelfFileDocStore', bound='FileDocStore') @@ -110,7 +110,7 @@ def push( @classmethod def push_stream( cls: Type[SelfFileDocStore], - docs: Iterator['BaseDocument'], + docs: Iterator['BaseDoc'], name: str, public: bool = True, show_progress: bool = False, @@ -171,7 +171,7 @@ def pull_stream( name: str, show_progress: bool, local_cache: bool, - ) -> Iterator['BaseDocument']: + ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified file. :param name: The file path to pull from. 
diff --git a/docarray/store/jac.py b/docarray/store/jac.py index 285e65e41f9..5bcdc849b20 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -29,7 +29,7 @@ if TYPE_CHECKING: # pragma: no cover import io - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: @@ -231,7 +231,7 @@ def gen(): @hubble.login_required def push_stream( cls: Type[SelfJACDocStore], - docs: Iterator['BaseDocument'], + docs: Iterator['BaseDoc'], name: str, public: bool = True, show_progress: bool = False, @@ -292,7 +292,7 @@ def pull_stream( name: str, show_progress: bool = False, local_cache: bool = False, - ) -> Iterator['BaseDocument']: + ) -> Iterator['BaseDoc']: """Pull a :class:`DocumentArray` from Jina AI Cloud to local. :param name: the upload name set during :meth:`.push` diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 89f5b5b1310..64399b827ab 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -13,7 +13,7 @@ from docarray.utils.cache import get_cache_path if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray SelfS3DocStore = TypeVar('SelfS3DocStore', bound='S3DocStore') @@ -133,7 +133,7 @@ def push( @staticmethod def push_stream( - docs: Iterator['BaseDocument'], + docs: Iterator['BaseDoc'], name: str, public: bool = True, show_progress: bool = False, @@ -199,7 +199,7 @@ def pull_stream( name: str, show_progress: bool, local_cache: bool, - ) -> Iterator['BaseDocument']: + ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified name. Name is expected to be in the format of bucket/key. 
diff --git a/docarray/typing/bytes/audio_bytes.py b/docarray/typing/bytes/audio_bytes.py index 2c179006087..6a6c4e14bfd 100644 --- a/docarray/typing/bytes/audio_bytes.py +++ b/docarray/typing/bytes/audio_bytes.py @@ -50,7 +50,7 @@ def load(self) -> Tuple[np.ndarray, int]: .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc import numpy as np from docarray.typing import AudioUrl diff --git a/docarray/typing/bytes/image_bytes.py b/docarray/typing/bytes/image_bytes.py index 000c5e77176..2888f60fadd 100644 --- a/docarray/typing/bytes/image_bytes.py +++ b/docarray/typing/bytes/image_bytes.py @@ -55,12 +55,12 @@ def load( .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import ImageUrl import numpy as np - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): img_url: ImageUrl diff --git a/docarray/typing/bytes/video_bytes.py b/docarray/typing/bytes/video_bytes.py index a7705c12a9d..fb45a36063f 100644 --- a/docarray/typing/bytes/video_bytes.py +++ b/docarray/typing/bytes/video_bytes.py @@ -59,12 +59,12 @@ def load(self, **kwargs) -> VideoLoadResult: .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import VideoUrl import numpy as np - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): video_url: VideoUrl diff --git a/docarray/typing/tensor/audio/audio_ndarray.py b/docarray/typing/tensor/audio/audio_ndarray.py index 93551df374d..bae64ed36c2 100644 --- a/docarray/typing/tensor/audio/audio_ndarray.py +++ b/docarray/typing/tensor/audio/audio_ndarray.py @@ -16,12 +16,12 @@ class AudioNdArray(AbstractAudioTensor, NdArray): from typing import Optional - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import AudioNdArray, AudioUrl import numpy as np - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): title: str audio_tensor: Optional[AudioNdArray] url: Optional[AudioUrl] diff --git a/docarray/typing/tensor/audio/audio_tensorflow_tensor.py b/docarray/typing/tensor/audio/audio_tensorflow_tensor.py index a06a2682c96..a91b5b3183a 100644 --- a/docarray/typing/tensor/audio/audio_tensorflow_tensor.py +++ b/docarray/typing/tensor/audio/audio_tensorflow_tensor.py @@ -25,11 +25,11 @@ class AudioTensorFlowTensor( import tensorflow as tf from pydantic import parse_obj_as - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import AudioTensorFlowTensor, AudioUrl - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): title: str audio_tensor: Optional[AudioTensorFlowTensor] url: Optional[AudioUrl] diff --git a/docarray/typing/tensor/audio/audio_torch_tensor.py b/docarray/typing/tensor/audio/audio_torch_tensor.py index 08bd3b9641c..86d09e2212f 100644 --- a/docarray/typing/tensor/audio/audio_torch_tensor.py +++ b/docarray/typing/tensor/audio/audio_torch_tensor.py @@ -18,11 +18,11 @@ class AudioTorchTensor(AbstractAudioTensor, TorchTensor, metaclass=metaTorchAndN import torch - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import AudioTorchTensor, 
AudioUrl - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): title: str audio_tensor: Optional[AudioTorchTensor] url: Optional[AudioUrl] diff --git a/docarray/typing/tensor/image/image_ndarray.py b/docarray/typing/tensor/image/image_ndarray.py index 5912960b037..df304cd1d21 100644 --- a/docarray/typing/tensor/image/image_ndarray.py +++ b/docarray/typing/tensor/image/image_ndarray.py @@ -20,11 +20,11 @@ class ImageNdArray(AbstractImageTensor, NdArray): from typing import Optional - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import ImageNdArray, ImageUrl - class MyImageDoc(BaseDocument): + class MyImageDoc(BaseDoc): title: str tensor: Optional[ImageNdArray] url: Optional[ImageUrl] diff --git a/docarray/typing/tensor/image/image_tensorflow_tensor.py b/docarray/typing/tensor/image/image_tensorflow_tensor.py index 62660de7987..7afcbb38086 100644 --- a/docarray/typing/tensor/image/image_tensorflow_tensor.py +++ b/docarray/typing/tensor/image/image_tensorflow_tensor.py @@ -24,11 +24,11 @@ class ImageTensorFlowTensor( from typing import Optional - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import ImageTensorFlowTensor, ImageUrl - class MyImageDoc(BaseDocument): + class MyImageDoc(BaseDoc): title: str tensor: Optional[ImageTensorFlowTensor] url: Optional[ImageUrl] diff --git a/docarray/typing/tensor/image/image_torch_tensor.py b/docarray/typing/tensor/image/image_torch_tensor.py index bc5bdd640de..90ecac13750 100644 --- a/docarray/typing/tensor/image/image_torch_tensor.py +++ b/docarray/typing/tensor/image/image_torch_tensor.py @@ -22,11 +22,11 @@ class ImageTorchTensor(AbstractImageTensor, TorchTensor, metaclass=metaTorchAndN from typing import Optional - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import ImageTorchTensor, ImageUrl - class MyImageDoc(BaseDocument): + class MyImageDoc(BaseDoc): title: str tensor: 
Optional[ImageTorchTensor] url: Optional[ImageUrl] diff --git a/docarray/typing/tensor/ndarray.py b/docarray/typing/tensor/ndarray.py index 1ba176c2748..2ed7d649868 100644 --- a/docarray/typing/tensor/ndarray.py +++ b/docarray/typing/tensor/ndarray.py @@ -50,16 +50,17 @@ class NdArray(np.ndarray, AbstractTensor, Generic[ShapeT]): .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import NdArray import numpy as np - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): arr: NdArray image_arr: NdArray[3, 224, 224] square_crop: NdArray[3, 'x', 'x'] - random_image: NdArray[3, ...] # first dimension is fixed, can have arbitrary shape + random_image: NdArray[3, ...] # first dimension is fixed, can have arbitrary shape + # create a document with tensors doc = MyDoc( diff --git a/docarray/typing/tensor/tensorflow_tensor.py b/docarray/typing/tensor/tensorflow_tensor.py index f08e68deeaa..ec686b486ba 100644 --- a/docarray/typing/tensor/tensorflow_tensor.py +++ b/docarray/typing/tensor/tensorflow_tensor.py @@ -94,16 +94,18 @@ class TensorFlowTensor(AbstractTensor, Generic[ShapeT], metaclass=metaTensorFlow .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import TensorFlowTensor import tensorflow as tf - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: TensorFlowTensor image_tensor: TensorFlowTensor[3, 224, 224] square_crop: TensorFlowTensor[3, 'x', 'x'] - random_image: TensorFlowTensor[3, ...] # first dimension is fixed, can have arbitrary shape + random_image: TensorFlowTensor[ + 3, ... + ] # first dimension is fixed, can have arbitrary shape # create a document with tensors diff --git a/docarray/typing/tensor/torch_tensor.py b/docarray/typing/tensor/torch_tensor.py index 0225f6760cd..a9c71d2e747 100644 --- a/docarray/typing/tensor/torch_tensor.py +++ b/docarray/typing/tensor/torch_tensor.py @@ -50,16 +50,18 @@ class TorchTensor( .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import TorchTensor import torch - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: TorchTensor image_tensor: TorchTensor[3, 224, 224] square_crop: TorchTensor[3, 'x', 'x'] - random_image: TorchTensor[3, ...] # first dimension is fixed, can have arbitrary shape + random_image: TorchTensor[ + 3, ... + ] # first dimension is fixed, can have arbitrary shape # create a document with tensors diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index 356ab0ac603..d52e9a1ae28 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -28,11 +28,11 @@ class VideoNdArray(NdArray, VideoTensorMixin): import numpy as np from pydantic import parse_obj_as - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import VideoNdArray, VideoUrl - class MyVideoDoc(BaseDocument): + class MyVideoDoc(BaseDoc): title: str url: Optional[VideoUrl] video_tensor: Optional[VideoNdArray] diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index 5a3fe88b398..cfa5a46ea70 100644 --- a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -53,12 +53,12 @@ def save( .. 
code-block:: python import numpy as np - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing.tensor.audio.audio_tensor import AudioTensor from docarray.typing.tensor.video.video_tensor import VideoTensor - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): video_tensor: VideoTensor audio_tensor: AudioTensor diff --git a/docarray/typing/tensor/video/video_tensorflow_tensor.py b/docarray/typing/tensor/video/video_tensorflow_tensor.py index a81fc377a44..39a7fce87ec 100644 --- a/docarray/typing/tensor/video/video_tensorflow_tensor.py +++ b/docarray/typing/tensor/video/video_tensorflow_tensor.py @@ -30,11 +30,11 @@ class VideoTensorFlowTensor( import tensorflow as tf from pydantic import parse_obj_as - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import VideoTensorFlowTensor, VideoUrl - class MyVideoDoc(BaseDocument): + class MyVideoDoc(BaseDoc): title: str url: Optional[VideoUrl] video_tensor: Optional[VideoTensorFlowTensor] diff --git a/docarray/typing/tensor/video/video_torch_tensor.py b/docarray/typing/tensor/video/video_torch_tensor.py index caee4dfca19..9c20159e7e4 100644 --- a/docarray/typing/tensor/video/video_torch_tensor.py +++ b/docarray/typing/tensor/video/video_torch_tensor.py @@ -28,11 +28,11 @@ class VideoTorchTensor(TorchTensor, VideoTensorMixin, metaclass=metaTorchAndNode import torch from pydantic import parse_obj_as - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import VideoTorchTensor, VideoUrl - class MyVideoDoc(BaseDocument): + class MyVideoDoc(BaseDoc): title: str url: Optional[VideoUrl] video_tensor: Optional[VideoTorchTensor] diff --git a/docarray/typing/url/audio_url.py b/docarray/typing/url/audio_url.py index 2e0f492d286..c971f6d364f 100644 --- a/docarray/typing/url/audio_url.py +++ b/docarray/typing/url/audio_url.py @@ -54,7 +54,7 @@ def load(self: T) -> Tuple[np.ndarray, int]: .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc import numpy as np from docarray.typing import AudioUrl diff --git a/docarray/typing/url/image_url.py b/docarray/typing/url/image_url.py index 63daecf7d14..b5c40b71e2d 100644 --- a/docarray/typing/url/image_url.py +++ b/docarray/typing/url/image_url.py @@ -53,12 +53,12 @@ def load( .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import ImageUrl import numpy as np - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): img_url: ImageUrl diff --git a/docarray/typing/url/text_url.py b/docarray/typing/url/text_url.py index f1fac003f79..049de511d1f 100644 --- a/docarray/typing/url/text_url.py +++ b/docarray/typing/url/text_url.py @@ -1,4 +1,4 @@ -from typing import Optional, TYPE_CHECKING, TypeVar, Type, Union, Any +from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar, Union from docarray.typing.proto_register import _register_proto from docarray.typing.url.any_url import AnyUrl @@ -47,11 +47,11 @@ def load(self, charset: str = 'utf-8', timeout: Optional[float] = None) -> str: .. code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import TextUrl - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): remote_url: TextUrl local_url: TextUrl diff --git a/docarray/typing/url/url_3d/mesh_url.py b/docarray/typing/url/url_3d/mesh_url.py index 93a843cd897..e820ba8a4d5 100644 --- a/docarray/typing/url/url_3d/mesh_url.py +++ b/docarray/typing/url/url_3d/mesh_url.py @@ -33,13 +33,13 @@ def load( .. 
code-block:: python - from docarray import BaseDocument + from docarray import BaseDoc import numpy as np from docarray.typing import Mesh3DUrl, NdArray - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): mesh_url: Mesh3DUrl diff --git a/docarray/typing/url/url_3d/point_cloud_url.py b/docarray/typing/url/url_3d/point_cloud_url.py index dd91486f808..502a29c740d 100644 --- a/docarray/typing/url/url_3d/point_cloud_url.py +++ b/docarray/typing/url/url_3d/point_cloud_url.py @@ -36,12 +36,12 @@ def load( .. code-block:: python import numpy as np - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import PointCloud3DUrl - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): point_cloud_url: PointCloud3DUrl @@ -97,7 +97,7 @@ def display( .. code-block:: python import numpy as np - from docarray import BaseDocument + from docarray import BaseDoc from docarray.documents import PointCloud3D diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index 63bc9a59790..44c2d33f9b2 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -57,12 +57,12 @@ def load(self: T, **kwargs) -> VideoLoadResult: from typing import Optional - from docarray import BaseDocument + from docarray import BaseDoc from docarray.typing import VideoUrl, VideoNdArray, AudioNdArray, NdArray - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): video_url: VideoUrl video: Optional[VideoNdArray] audio: Optional[AudioNdArray] diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index 87b3b6e43f0..ceefa10bfd1 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -1,14 +1,14 @@ import json from typing import Dict, List, Union -from docarray.array.abstract_array import AnyDocumentArray +from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocumentArray def filter_docs( - docs: AnyDocumentArray, + docs: AnyDocArray, query: Union[str, Dict, List[Dict]], -) 
-> AnyDocumentArray: +) -> AnyDocArray: """ Filter the Documents in the index according to the given filter query. @@ -17,12 +17,12 @@ def filter_docs( .. code-block:: python - from docarray import DocumentArray, BaseDocument + from docarray import DocumentArray, BaseDoc from docarray.documents import Text, Image from docarray.util.filter import filter_docs - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): caption: Text image: Image price: int diff --git a/docarray/utils/find.py b/docarray/utils/find.py index f4b59779e3e..4bbd47767da 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -2,10 +2,10 @@ from typing_inspect import is_union_type -from docarray.array.abstract_array import AnyDocumentArray +from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocumentArray from docarray.array.stacked.array_stacked import DocumentArrayStacked -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.helper import _get_field_type_by_access_path from docarray.typing import AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -22,8 +22,8 @@ class _FindResult(NamedTuple): def find( - index: AnyDocumentArray, - query: Union[AnyTensor, BaseDocument], + index: AnyDocArray, + query: Union[AnyTensor, BaseDoc], embedding_field: str = 'embedding', metric: str = 'cosine_sim', limit: int = 10, @@ -50,12 +50,12 @@ def find( .. 
code-block:: python - from docarray import DocumentArray, BaseDocument + from docarray import DocumentArray, BaseDoc from docarray.typing import TorchTensor from docarray.util.find import find - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): embedding: TorchTensor @@ -111,7 +111,7 @@ class MyDocument(BaseDocument): def find_batched( - index: AnyDocumentArray, + index: AnyDocArray, query: Union[AnyTensor, DocumentArray], embedding_field: str = 'embedding', metric: str = 'cosine_sim', @@ -139,12 +139,12 @@ def find_batched( .. code-block:: python - from docarray import DocumentArray, BaseDocument + from docarray import DocumentArray, BaseDoc from docarray.typing import TorchTensor from docarray.util.find import find - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): embedding: TorchTensor @@ -219,7 +219,7 @@ class MyDocument(BaseDocument): def _extract_embedding_single( - data: Union[DocumentArray, BaseDocument, AnyTensor], + data: Union[DocumentArray, BaseDoc, AnyTensor], embedding_field: str, ) -> AnyTensor: """Extract the embeddings from a single query, @@ -230,8 +230,8 @@ def _extract_embedding_single( :param embedding_type: type of the embedding: torch.Tensor, numpy.ndarray etc. 
:return: the embeddings """ - if isinstance(data, BaseDocument): - emb = next(AnyDocumentArray._traverse(data, embedding_field)) + if isinstance(data, BaseDoc): + emb = next(AnyDocArray._traverse(data, embedding_field)) else: # treat data as tensor emb = data if len(emb.shape) == 1: @@ -242,7 +242,7 @@ def _extract_embedding_single( def _extract_embeddings( - data: Union[AnyDocumentArray, BaseDocument, AnyTensor], + data: Union[AnyDocArray, BaseDoc, AnyTensor], embedding_field: str, embedding_type: Type, ) -> AnyTensor: @@ -255,10 +255,10 @@ def _extract_embeddings( """ emb: AnyTensor if isinstance(data, DocumentArray): - emb_list = list(AnyDocumentArray._traverse(data, embedding_field)) + emb_list = list(AnyDocArray._traverse(data, embedding_field)) emb = embedding_type._docarray_stack(emb_list) - elif isinstance(data, (DocumentArrayStacked, BaseDocument)): - emb = next(AnyDocumentArray._traverse(data, embedding_field)) + elif isinstance(data, (DocumentArrayStacked, BaseDoc)): + emb = next(AnyDocArray._traverse(data, embedding_field)) else: # treat data as tensor emb = cast(AnyTensor, data) @@ -267,7 +267,7 @@ def _extract_embeddings( return emb -def _da_attr_type(da: AnyDocumentArray, access_path: str) -> Type[AnyTensor]: +def _da_attr_type(da: AnyDocArray, access_path: str) -> Type[AnyTensor]: """Get the type of the attribute according to the Document type (schema) of the DocumentArray. 
@@ -283,7 +283,7 @@ def _da_attr_type(da: AnyDocumentArray, access_path: str) -> Type[AnyTensor]: if is_union_type(field_type): # determine type based on the fist element - field_type = type(next(AnyDocumentArray._traverse(da[0], access_path))) + field_type = type(next(AnyDocArray._traverse(da[0], access_path))) if not issubclass(field_type, AbstractTensor): raise ValueError( diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 445e42d1bb9..dc0d7fb17a9 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -5,12 +5,12 @@ from rich.progress import track -from docarray import BaseDocument -from docarray.array.abstract_array import AnyDocumentArray +from docarray import BaseDoc +from docarray.array.abstract_array import AnyDocArray from docarray.helper import _is_lambda_or_partial_or_local_function -T = TypeVar('T', bound=AnyDocumentArray) -T_doc = TypeVar('T_doc', bound=BaseDocument) +T = TypeVar('T', bound=AnyDocArray) +T_doc = TypeVar('T_doc', bound=BaseDoc) def map_docs( @@ -48,8 +48,8 @@ def load_url_to_tensor(img: Image) -> Image: assert doc.tensor is not None :param da: DocumentArray to apply function to - :param func: a function that takes a :class:`BaseDocument` as input and outputs - a :class:`BaseDocument`. + :param func: a function that takes a :class:`BaseDoc` as input and outputs + a :class:`BaseDoc`. :param backend: `thread` for multithreading and `process` for multiprocessing. Defaults to `thread`. In general, if `func` is IO-bound then `thread` is a good choice. @@ -107,16 +107,16 @@ def map_docs_batch( """ Return an iterator that applies `func` to every **minibatch** of iterable in parallel, yielding the results. - Each element in the returned iterator is an :class:`AnyDocumentArray`. + Each element in the returned iterator is an :class:`AnyDocArray`. EXAMPLE USAGE .. 
code-block:: python - from docarray import BaseDocument, DocumentArray + from docarray import BaseDoc, DocumentArray from docarray.utils.map import map_docs_batch - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): name: str @@ -142,8 +142,8 @@ def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]: :param batch_size: Size of each generated batch (except the last one, which might be smaller). :param shuffle: If set, shuffle the Documents before dividing into minibatches. - :param func: a function that takes an :class:`AnyDocumentArray` as input and outputs - an :class:`AnyDocumentArray` or a :class:`BaseDocument`. + :param func: a function that takes an :class:`AnyDocArray` as input and outputs + an :class:`AnyDocArray` or a :class:`BaseDoc`. :param backend: `thread` for multithreading and `process` for multiprocessing. Defaults to `thread`. In general, if `func` is IO-bound then `thread` is a good choice. diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index 8e3e6fc29fc..bad86da41a7 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from docarray.typing import NdArray from docarray.utils.map import map_docs, map_docs_batch @@ -13,7 +13,7 @@ pytestmark = [pytest.mark.benchmark, pytest.mark.slow] -class MyMatrix(BaseDocument): +class MyMatrix(BaseDoc): matrix: NdArray diff --git a/tests/index/base_classes/test_base_doc_store.py b/tests/index/base_classes/test_base_doc_store.py index f008dff9d6d..4fa67344100 100644 --- a/tests/index/base_classes/test_base_doc_store.py +++ b/tests/index/base_classes/test_base_doc_store.py @@ -5,30 +5,27 @@ import pytest from pydantic import Field -from docarray import BaseDocument, DocumentArray -from docarray.index.abstract import ( - BaseDocumentIndex, - 
_raise_not_composable, -) +from docarray import BaseDoc, DocumentArray +from docarray.index.abstract import BaseDocIndex, _raise_not_composable from docarray.typing import ID, NdArray pytestmark = pytest.mark.index -class SimpleDoc(BaseDocument): +class SimpleDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000) -class FlatDoc(BaseDocument): +class FlatDoc(BaseDoc): tens_one: NdArray = Field(dim=10) tens_two: NdArray = Field(dim=50) -class NestedDoc(BaseDocument): +class NestedDoc(BaseDoc): d: SimpleDoc -class DeepNestedDoc(BaseDocument): +class DeepNestedDoc(BaseDoc): d: NestedDoc @@ -40,9 +37,9 @@ def _identity(*x, **y): return x, y -class DummyDocIndex(BaseDocumentIndex): +class DummyDocIndex(BaseDocIndex): @dataclass - class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): + class RuntimeConfig(BaseDocIndex.RuntimeConfig): default_column_config: Dict[Type, Dict[str, Any]] = field( default_factory=lambda: { str: {'hi': 'there'}, @@ -52,10 +49,10 @@ class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): ) @dataclass - class DBConfig(BaseDocumentIndex.DBConfig): + class DBConfig(BaseDocIndex.DBConfig): work_dir: str = '.' 
- class QueryBuilder(BaseDocumentIndex.QueryBuilder): + class QueryBuilder(BaseDocIndex.QueryBuilder): def build(self): return None @@ -183,7 +180,7 @@ def test_flatten_schema(): def test_columns_db_type_with_user_defined_mapping(tmp_path): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000, col_type=np.ndarray) store = DummyDocIndex[MyDoc](work_dir=str(tmp_path)) @@ -192,7 +189,7 @@ class MyDoc(BaseDocument): def test_columns_db_type_with_user_defined_mapping_additional_params(tmp_path): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000, col_type='varchar', max_len=1024) store = DummyDocIndex[MyDoc](work_dir=str(tmp_path)) @@ -202,7 +199,7 @@ class MyDoc(BaseDocument): def test_columns_illegal_mapping(tmp_path): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000, col_type='non_valid_type') with pytest.raises( @@ -224,11 +221,11 @@ class OtherNestedDoc(NestedDoc): # SIMPLE store = DummyDocIndex[SimpleDoc]() in_list = [SimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) in_da = DocumentArray[SimpleDoc](in_list) assert store._validate_docs(in_da) == in_da in_other_list = [OtherSimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDoc]) in_other_da = DocumentArray[OtherSimpleDoc](in_other_list) assert store._validate_docs(in_other_da) == in_other_da @@ -257,7 +254,7 @@ class OtherNestedDoc(NestedDoc): in_list = [ FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) in_da = 
DocumentArray[FlatDoc]( [FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,)))] ) @@ -265,7 +262,7 @@ class OtherNestedDoc(NestedDoc): in_other_list = [ OtherFlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDoc]) in_other_da = DocumentArray[OtherFlatDoc]( [ OtherFlatDoc( @@ -284,13 +281,13 @@ class OtherNestedDoc(NestedDoc): # NESTED store = DummyDocIndex[NestedDoc]() in_list = [NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) in_da = DocumentArray[NestedDoc]( [NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))] ) assert store._validate_docs(in_da) == in_da in_other_list = [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDoc]) in_other_da = DocumentArray[OtherNestedDoc]( [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] ) @@ -305,16 +302,16 @@ class OtherNestedDoc(NestedDoc): def test_docs_validation_unions(): - class OptionalDoc(BaseDocument): + class OptionalDoc(BaseDoc): tens: Optional[NdArray[10]] = Field(dim=1000) - class UnionDoc(BaseDocument): + class UnionDoc(BaseDoc): tens: Union[NdArray[10], str] = Field(dim=1000) # OPTIONAL store = DummyDocIndex[SimpleDoc]() in_list = [OptionalDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) in_da = DocumentArray[OptionalDoc](in_list) assert store._validate_docs(in_da) == in_da @@ -324,9 +321,9 @@ class UnionDoc(BaseDocument): 
# OTHER UNION store = DummyDocIndex[SimpleDoc]() in_list = [UnionDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) in_da = DocumentArray[UnionDoc](in_list) - assert isinstance(store._validate_docs(in_da), DocumentArray[BaseDocument]) + assert isinstance(store._validate_docs(in_da), DocumentArray[BaseDoc]) with pytest.raises(ValueError): store._validate_docs([UnionDoc(tens='hello')]) diff --git a/tests/index/base_classes/test_configs.py b/tests/index/base_classes/test_configs.py index 7f473ac03e3..8cae5524ec9 100644 --- a/tests/index/base_classes/test_configs.py +++ b/tests/index/base_classes/test_configs.py @@ -4,14 +4,14 @@ import pytest from pydantic import Field -from docarray import BaseDocument -from docarray.index.abstract import BaseDocumentIndex +from docarray import BaseDoc +from docarray.index.abstract import BaseDocIndex from docarray.typing import NdArray pytestmark = pytest.mark.index -class SimpleDoc(BaseDocument): +class SimpleDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000) @@ -20,13 +20,13 @@ class FakeQueryBuilder: @dataclass -class DBConfig(BaseDocumentIndex.DBConfig): +class DBConfig(BaseDocIndex.DBConfig): work_dir: str = '.' 
other: int = 5 @dataclass -class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): +class RuntimeConfig(BaseDocIndex.RuntimeConfig): default_column_config: Dict[Type, Dict[str, Any]] = field( default_factory=lambda: { str: { @@ -42,7 +42,7 @@ def _identity(*x, **y): return x, y -class DummyDocIndex(BaseDocumentIndex): +class DummyDocIndex(BaseDocIndex): DBConfig = DBConfig RuntimeConfig = RuntimeConfig diff --git a/tests/index/hnswlib/test_find.py b/tests/index/hnswlib/test_find.py index cfc7679bcea..0aca0383a94 100644 --- a/tests/index/hnswlib/test_find.py +++ b/tests/index/hnswlib/test_find.py @@ -3,37 +3,37 @@ import torch from pydantic import Field -from docarray import BaseDocument +from docarray import BaseDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray, TorchTensor pytestmark = [pytest.mark.slow, pytest.mark.index] -class SimpleDoc(BaseDocument): +class SimpleDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000) -class FlatDoc(BaseDocument): +class FlatDoc(BaseDoc): tens_one: NdArray = Field(dim=10) tens_two: NdArray = Field(dim=50) -class NestedDoc(BaseDocument): +class NestedDoc(BaseDoc): d: SimpleDoc -class DeepNestedDoc(BaseDocument): +class DeepNestedDoc(BaseDoc): d: NestedDoc -class TorchDoc(BaseDocument): +class TorchDoc(BaseDoc): tens: TorchTensor[10] @pytest.mark.parametrize('space', ['cosine', 'l2', 'ip']) def test_find_simple_schema(tmp_path, space): - class SimpleSchema(BaseDocument): + class SimpleSchema(BaseDoc): tens: NdArray[10] = Field(space=space) store = HnswDocumentIndex[SimpleSchema](work_dir=str(tmp_path)) @@ -83,7 +83,7 @@ def test_find_torch(tmp_path, space): def test_find_tensorflow(tmp_path): from docarray.typing import TensorFlowTensor - class TfDoc(BaseDocument): + class TfDoc(BaseDoc): tens: TensorFlowTensor[10] store = HnswDocumentIndex[TfDoc](work_dir=str(tmp_path)) @@ -113,7 +113,7 @@ class TfDoc(BaseDocument): @pytest.mark.parametrize('space', ['cosine', 'l2', 'ip']) def 
test_find_flat_schema(tmp_path, space): - class FlatSchema(BaseDocument): + class FlatSchema(BaseDoc): tens_one: NdArray = Field(dim=10, space=space) tens_two: NdArray = Field(dim=50, space=space) @@ -147,14 +147,14 @@ class FlatSchema(BaseDocument): @pytest.mark.parametrize('space', ['cosine', 'l2', 'ip']) def test_find_nested_schema(tmp_path, space): - class SimpleDoc(BaseDocument): + class SimpleDoc(BaseDoc): tens: NdArray[10] = Field(space=space) - class NestedDoc(BaseDocument): + class NestedDoc(BaseDoc): d: SimpleDoc tens: NdArray[10] = Field(space=space) - class DeepNestedDoc(BaseDocument): + class DeepNestedDoc(BaseDoc): d: NestedDoc tens: NdArray = Field(space=space, dim=10) diff --git a/tests/index/hnswlib/test_index_get_del.py b/tests/index/hnswlib/test_index_get_del.py index 0d4eb02f537..fa5c5f051b2 100644 --- a/tests/index/hnswlib/test_index_get_del.py +++ b/tests/index/hnswlib/test_index_get_del.py @@ -6,7 +6,7 @@ import torch from pydantic import Field -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc, TextDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray, NdArrayEmbedding, TorchTensor @@ -14,24 +14,24 @@ pytestmark = [pytest.mark.slow, pytest.mark.index] -class SimpleDoc(BaseDocument): +class SimpleDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000) -class FlatDoc(BaseDocument): +class FlatDoc(BaseDoc): tens_one: NdArray = Field(dim=10) tens_two: NdArray = Field(dim=50) -class NestedDoc(BaseDocument): +class NestedDoc(BaseDoc): d: SimpleDoc -class DeepNestedDoc(BaseDocument): +class DeepNestedDoc(BaseDoc): d: NestedDoc -class TorchDoc(BaseDocument): +class TorchDoc(BaseDoc): tens: TorchTensor[10] @@ -66,7 +66,7 @@ def test_index_simple_schema(ten_simple_docs, tmp_path, use_docarray): def test_schema_with_user_defined_mapping(tmp_path): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000, 
col_type=np.ndarray) store = HnswDocumentIndex[MyDoc](work_dir=str(tmp_path)) @@ -114,7 +114,7 @@ def test_index_torch(tmp_path): def test_index_tf(tmp_path): from docarray.typing import TensorFlowTensor - class TfDoc(BaseDocument): + class TfDoc(BaseDoc): tens: TensorFlowTensor[10] docs = [TfDoc(tens=np.random.randn(10)) for _ in range(10)] diff --git a/tests/index/hnswlib/test_persist_data.py b/tests/index/hnswlib/test_persist_data.py index a0a86eee9ab..1ac02d11d42 100644 --- a/tests/index/hnswlib/test_persist_data.py +++ b/tests/index/hnswlib/test_persist_data.py @@ -2,18 +2,18 @@ import pytest from pydantic import Field -from docarray import BaseDocument +from docarray import BaseDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray pytestmark = [pytest.mark.slow, pytest.mark.index] -class SimpleDoc(BaseDocument): +class SimpleDoc(BaseDoc): tens: NdArray[10] = Field(dim=1000) -class NestedDoc(BaseDocument): +class NestedDoc(BaseDoc): d: SimpleDoc tens: NdArray[50] diff --git a/tests/integrations/array/test_torch_train.py b/tests/integrations/array/test_torch_train.py index fb7c34befa5..8ed42a2ac19 100644 --- a/tests/integrations/array/test_torch_train.py +++ b/tests/integrations/array/test_torch_train.py @@ -2,12 +2,12 @@ import torch -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.typing import TorchTensor def test_torch_train(): - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): text: str tensor: Optional[TorchTensor[3, 224, 224]] diff --git a/tests/integrations/document/test_document.py b/tests/integrations/document/test_document.py index 35cbba24d53..3a421a93b5a 100644 --- a/tests/integrations/document/test_document.py +++ b/tests/integrations/document/test_document.py @@ -5,18 +5,18 @@ from pydantic import BaseModel, ValidationError from typing_extensions import TypedDict -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, 
DocumentArray from docarray.documents import AudioDoc, ImageDoc, TextDoc from docarray.documents.helper import ( create_doc, - create_doc_from_typeddict, create_doc_from_dict, + create_doc_from_typeddict, ) from docarray.typing import AudioNdArray def test_multi_modal_doc(): - class MyMultiModalDoc(BaseDocument): + class MyMultiModalDoc(BaseDoc): image: ImageDoc text: TextDoc @@ -24,7 +24,7 @@ class MyMultiModalDoc(BaseDocument): image=ImageDoc(tensor=np.zeros((3, 224, 224))), text=TextDoc(text='hello') ) - assert isinstance(doc.image, BaseDocument) + assert isinstance(doc.image, BaseDoc) assert isinstance(doc.image, ImageDoc) assert isinstance(doc.text, TextDoc) @@ -33,7 +33,7 @@ class MyMultiModalDoc(BaseDocument): def test_nested_chunks_document(): - class ChunksDocument(BaseDocument): + class ChunksDocument(BaseDoc): text: str images: DocumentArray[ImageDoc] @@ -58,13 +58,13 @@ def test_create_doc(): 'MyMultiModalDoc', image=(ImageDoc, ...), text=(TextDoc, ...) ) - assert issubclass(MyMultiModalDoc, BaseDocument) + assert issubclass(MyMultiModalDoc, BaseDoc) doc = MyMultiModalDoc( image=ImageDoc(tensor=np.zeros((3, 224, 224))), text=TextDoc(text='hello') ) - assert isinstance(doc.image, BaseDocument) + assert isinstance(doc.image, BaseDoc) assert isinstance(doc.image, ImageDoc) assert isinstance(doc.text, TextDoc) @@ -78,7 +78,7 @@ def test_create_doc(): tensor=(Optional[AudioNdArray], ...), ) - assert issubclass(MyAudio, BaseDocument) + assert issubclass(MyAudio, BaseDoc) assert issubclass(MyAudio, AudioDoc) @@ -92,7 +92,7 @@ class MyMultiModalDoc(TypedDict): Doc = create_doc_from_typeddict(MyMultiModalDoc) - assert issubclass(Doc, BaseDocument) + assert issubclass(Doc, BaseDoc) class MyAudio(TypedDict): title: str @@ -100,7 +100,7 @@ class MyAudio(TypedDict): Doc = create_doc_from_typeddict(MyAudio, __base__=AudioDoc) - assert issubclass(Doc, BaseDocument) + assert issubclass(Doc, BaseDoc) assert issubclass(Doc, AudioDoc) @@ -113,7 +113,7 @@ def 
test_create_doc_from_dict(): MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict) - assert issubclass(MyDoc, BaseDocument) + assert issubclass(MyDoc, BaseDoc) doc = MyDoc( image=ImageDoc(tensor=np.random.rand(3, 224, 224)), @@ -121,7 +121,7 @@ def test_create_doc_from_dict(): id=111, ) - assert isinstance(doc, BaseDocument) + assert isinstance(doc, BaseDoc) assert isinstance(doc.text, TextDoc) assert isinstance(doc.image, ImageDoc) assert isinstance(doc.id, int) @@ -142,9 +142,9 @@ def test_create_doc_from_dict(): data_dict = {'text': 'some text', 'other': None} MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict) - assert issubclass(MyDoc, BaseDocument) + assert issubclass(MyDoc, BaseDoc) doc1 = MyDoc(text='txt', other=10) doc2 = MyDoc(text='txt', other='also text') - assert isinstance(doc1, BaseDocument) and isinstance(doc2, BaseDocument) + assert isinstance(doc1, BaseDoc) and isinstance(doc2, BaseDoc) diff --git a/tests/integrations/document/test_proto.py b/tests/integrations/document/test_proto.py index a509d9a211e..2717dd7f423 100644 --- a/tests/integrations/document/test_proto.py +++ b/tests/integrations/document/test_proto.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc, TextDoc from docarray.typing import ( AnyEmbedding, @@ -30,7 +30,7 @@ @pytest.mark.proto def test_multi_modal_doc_proto(): - class MyMultiModalDoc(BaseDocument): + class MyMultiModalDoc(BaseDoc): image: ImageDoc text: TextDoc @@ -43,10 +43,10 @@ class MyMultiModalDoc(BaseDocument): @pytest.mark.proto def test_all_types(): - class NestedDoc(BaseDocument): + class NestedDoc(BaseDoc): tensor: NdArray - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): img_url: ImageUrl txt_url: TextUrl mesh_url: Mesh3DUrl @@ -126,10 +126,10 @@ class MyDoc(BaseDocument): @pytest.mark.tensorflow def test_tensorflow_types(): - class 
NestedDoc(BaseDocument): + class NestedDoc(BaseDoc): tensor: TensorFlowTensor - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tf_tensor: TensorFlowTensor tf_tensor_param: TensorFlowTensor[224, 224, 3] generic_tf_tensor: AnyTensor diff --git a/tests/integrations/document/test_to_json.py b/tests/integrations/document/test_to_json.py index 10fd7de1597..9bc06be14da 100644 --- a/tests/integrations/document/test_to_json.py +++ b/tests/integrations/document/test_to_json.py @@ -2,14 +2,14 @@ import pytest import torch -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.base_document.io.json import orjson_dumps from docarray.typing import AnyUrl, NdArray, TorchTensor @pytest.fixture() def doc_and_class(): - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): img: NdArray url: AnyUrl txt: str diff --git a/tests/integrations/externals/test_fastapi.py b/tests/integrations/externals/test_fastapi.py index 840be48f3a0..03bc4650775 100644 --- a/tests/integrations/externals/test_fastapi.py +++ b/tests/integrations/externals/test_fastapi.py @@ -3,15 +3,15 @@ from fastapi import FastAPI from httpx import AsyncClient -from docarray import BaseDocument -from docarray.base_document import DocumentResponse +from docarray import BaseDoc +from docarray.base_document import DocResponse from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @pytest.mark.asyncio async def test_fast_api(): - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): img: ImageDoc text: TextDoc title: str @@ -22,7 +22,7 @@ class Mmdoc(BaseDocument): app = FastAPI() - @app.post("/doc/", response_model=Mmdoc, response_class=DocumentResponse) + @app.post("/doc/", response_model=Mmdoc, response_class=DocResponse) async def create_item(doc: Mmdoc): return doc @@ -38,10 +38,10 @@ async def create_item(doc: Mmdoc): @pytest.mark.asyncio async def test_image(): - class InputDoc(BaseDocument): + class InputDoc(BaseDoc): img: ImageDoc - class 
OutputDoc(BaseDocument): + class OutputDoc(BaseDoc): embedding_clip: NdArray embedding_bert: NdArray @@ -49,7 +49,7 @@ class OutputDoc(BaseDocument): app = FastAPI() - @app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) + @app.post("/doc/", response_model=OutputDoc, response_class=DocResponse) async def create_item(doc: InputDoc) -> OutputDoc: ## call my fancy model to generate the embeddings doc = OutputDoc( @@ -75,10 +75,10 @@ async def create_item(doc: InputDoc) -> OutputDoc: @pytest.mark.asyncio async def test_sentence_to_embeddings(): - class InputDoc(BaseDocument): + class InputDoc(BaseDoc): text: str - class OutputDoc(BaseDocument): + class OutputDoc(BaseDoc): embedding_clip: NdArray embedding_bert: NdArray @@ -86,7 +86,7 @@ class OutputDoc(BaseDocument): app = FastAPI() - @app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) + @app.post("/doc/", response_model=OutputDoc, response_class=DocResponse) async def create_item(doc: InputDoc) -> OutputDoc: ## call my fancy model to generate the embeddings return OutputDoc( diff --git a/tests/integrations/predefined_document/test_audio.py b/tests/integrations/predefined_document/test_audio.py index e665f3d9d93..c25aab9b1ab 100644 --- a/tests/integrations/predefined_document/test_audio.py +++ b/tests/integrations/predefined_document/test_audio.py @@ -6,7 +6,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import AudioDoc from docarray.typing import AudioUrl from docarray.typing.tensor.audio import AudioNdArray, AudioTorchTensor @@ -202,7 +202,7 @@ def test_audio_bytes(): def test_audio_shortcut_doc(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): audio: AudioDoc audio2: AudioDoc audio3: AudioDoc diff --git a/tests/integrations/predefined_document/test_image.py b/tests/integrations/predefined_document/test_image.py index b2b7077ad43..92e19d09a14 100644 --- 
a/tests/integrations/predefined_document/test_image.py +++ b/tests/integrations/predefined_document/test_image.py @@ -3,7 +3,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import ImageDoc from docarray.utils.misc import is_tf_available @@ -50,7 +50,7 @@ def test_image_tensorflow(): def test_image_shortcut_doc(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): image: ImageDoc image2: ImageDoc image3: ImageDoc diff --git a/tests/integrations/predefined_document/test_mesh.py b/tests/integrations/predefined_document/test_mesh.py index a4e7b072a0a..7b6edecb11d 100644 --- a/tests/integrations/predefined_document/test_mesh.py +++ b/tests/integrations/predefined_document/test_mesh.py @@ -2,7 +2,7 @@ import pytest from pydantic import parse_obj_as -from docarray.base_document.document import BaseDocument +from docarray.base_document.doc import BaseDoc from docarray.documents import Mesh3D from tests import TOYDATA_DIR @@ -29,7 +29,7 @@ def test_str_init(): def test_doc(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): mesh1: Mesh3D mesh2: Mesh3D diff --git a/tests/integrations/predefined_document/test_point_cloud.py b/tests/integrations/predefined_document/test_point_cloud.py index 7251d6c7380..76b7e0236d8 100644 --- a/tests/integrations/predefined_document/test_point_cloud.py +++ b/tests/integrations/predefined_document/test_point_cloud.py @@ -3,7 +3,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import PointCloud3D from docarray.utils.misc import is_tf_available from tests import TOYDATA_DIR @@ -46,7 +46,7 @@ def test_point_cloud_tensorflow(): def test_point_cloud_shortcut_doc(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): pc: PointCloud3D pc2: PointCloud3D pc3: PointCloud3D @@ -63,7 +63,7 @@ class MyDoc(BaseDocument): @pytest.mark.tensorflow def 
test_point_cloud_shortcut_doc_tf(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): pc: PointCloud3D pc2: PointCloud3D diff --git a/tests/integrations/predefined_document/test_text.py b/tests/integrations/predefined_document/test_text.py index ed8395012e1..da5d31092fe 100644 --- a/tests/integrations/predefined_document/test_text.py +++ b/tests/integrations/predefined_document/test_text.py @@ -1,6 +1,6 @@ from pydantic import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import TextDoc @@ -15,7 +15,7 @@ def test_str_init(): def test_doc(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): text1: TextDoc text2: TextDoc diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py index a79766ea5e1..e208e3890c9 100644 --- a/tests/integrations/predefined_document/test_video.py +++ b/tests/integrations/predefined_document/test_video.py @@ -3,7 +3,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import VideoDoc from docarray.typing import AudioNdArray, NdArray, VideoNdArray from docarray.utils.misc import is_tf_available @@ -48,7 +48,7 @@ def test_video_tensorflow(): def test_video_shortcut_doc(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): video: VideoDoc video2: VideoDoc video3: VideoDoc diff --git a/tests/integrations/torch/data/test_torch_dataset.py b/tests/integrations/torch/data/test_torch_dataset.py index 8b95f28e5f5..569b66db49b 100644 --- a/tests/integrations/torch/data/test_torch_dataset.py +++ b/tests/integrations/torch/data/test_torch_dataset.py @@ -2,12 +2,12 @@ import torch from torch.utils.data import DataLoader -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.data import MultiModalDataset from docarray.documents import ImageDoc, TextDoc -class PairTextImage(BaseDocument): 
+class PairTextImage(BaseDoc): text: TextDoc image: ImageDoc diff --git a/tests/integrations/typing/test_anyurl.py b/tests/integrations/typing/test_anyurl.py index 6fd8016310d..fbd6abd417e 100644 --- a/tests/integrations/typing/test_anyurl.py +++ b/tests/integrations/typing/test_anyurl.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import AnyUrl def test_set_any_url(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): any_url: AnyUrl d = MyDocument(any_url="https://jina.ai") diff --git a/tests/integrations/typing/test_embedding.py b/tests/integrations/typing/test_embedding.py index 1346fc482b7..c3db75d9f57 100644 --- a/tests/integrations/typing/test_embedding.py +++ b/tests/integrations/typing/test_embedding.py @@ -1,11 +1,11 @@ import numpy as np -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import AnyEmbedding def test_set_embedding(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): embedding: AnyEmbedding d = MyDocument(embedding=np.zeros((3, 224, 224))) diff --git a/tests/integrations/typing/test_id.py b/tests/integrations/typing/test_id.py index 1b62b821758..9e0ac05ffb1 100644 --- a/tests/integrations/typing/test_id.py +++ b/tests/integrations/typing/test_id.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import ID def test_set_id(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): id: ID d = MyDocument(id="123") diff --git a/tests/integrations/typing/test_image_url.py b/tests/integrations/typing/test_image_url.py index 648d3552ef9..008ea536b63 100644 --- a/tests/integrations/typing/test_image_url.py +++ b/tests/integrations/typing/test_image_url.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import ImageUrl def test_set_image_url(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): image_url: 
ImageUrl d = MyDocument(image_url="https://jina.ai/img.png") diff --git a/tests/integrations/typing/test_mesh_url.py b/tests/integrations/typing/test_mesh_url.py index b50547629e0..50a5eb05699 100644 --- a/tests/integrations/typing/test_mesh_url.py +++ b/tests/integrations/typing/test_mesh_url.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import Mesh3DUrl def test_set_mesh_url(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): mesh_url: Mesh3DUrl d = MyDocument(mesh_url="https://jina.ai/mesh.obj") diff --git a/tests/integrations/typing/test_ndarray.py b/tests/integrations/typing/test_ndarray.py index 7b8efbbbdf3..5bdcc95667d 100644 --- a/tests/integrations/typing/test_ndarray.py +++ b/tests/integrations/typing/test_ndarray.py @@ -1,11 +1,11 @@ import numpy as np -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import NdArray def test_set_tensor(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): tensor: NdArray d = MyDocument(tensor=np.zeros((3, 224, 224))) diff --git a/tests/integrations/typing/test_point_cloud_url.py b/tests/integrations/typing/test_point_cloud_url.py index 3b46e64e08b..64bc06bb086 100644 --- a/tests/integrations/typing/test_point_cloud_url.py +++ b/tests/integrations/typing/test_point_cloud_url.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import PointCloud3DUrl def test_set_point_cloud_url(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): point_cloud_url: PointCloud3DUrl d = MyDocument(point_cloud_url="https://jina.ai/mesh.obj") diff --git a/tests/integrations/typing/test_tensor.py b/tests/integrations/typing/test_tensor.py index a2c92f4090e..ba15e2d5c94 100644 --- a/tests/integrations/typing/test_tensor.py +++ b/tests/integrations/typing/test_tensor.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import BaseDocument +from docarray 
import BaseDoc from docarray.typing import AnyTensor, NdArray, TorchTensor from docarray.utils.misc import is_tf_available @@ -17,7 +17,7 @@ def test_set_tensor(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): tensor: AnyTensor d = MyDocument(tensor=np.zeros((3, 224, 224))) @@ -35,7 +35,7 @@ class MyDocument(BaseDocument): @pytest.mark.tensorflow def test_set_tensor_tensorflow(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): tensor: AnyTensor d = MyDocument(tensor=tf.zeros((3, 224, 224))) diff --git a/tests/integrations/typing/test_tensorflow_tensor.py b/tests/integrations/typing/test_tensorflow_tensor.py index b0337df749d..84505968090 100644 --- a/tests/integrations/typing/test_tensorflow_tensor.py +++ b/tests/integrations/typing/test_tensorflow_tensor.py @@ -1,6 +1,6 @@ import pytest -from docarray import BaseDocument +from docarray import BaseDoc from docarray.utils.misc import is_tf_available tf_available = is_tf_available() @@ -13,7 +13,7 @@ @pytest.mark.tensorflow def test_set_tensorflow_tensor(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): t: TensorFlowTensor doc = MyDocument(t=tf.zeros((3, 224, 224))) @@ -25,7 +25,7 @@ class MyDocument(BaseDocument): @pytest.mark.tensorflow def test_set_tf_embedding(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): embedding: TensorFlowEmbedding doc = MyDocument(embedding=tf.zeros((128,))) diff --git a/tests/integrations/typing/test_torch_tensor.py b/tests/integrations/typing/test_torch_tensor.py index e9f1b5549dd..2a84489cd97 100644 --- a/tests/integrations/typing/test_torch_tensor.py +++ b/tests/integrations/typing/test_torch_tensor.py @@ -1,11 +1,11 @@ import torch -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import TorchEmbedding, TorchTensor def test_set_torch_tensor(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): tensor: TorchTensor d = MyDocument(tensor=torch.zeros((3, 224, 224))) @@ -16,7 
+16,7 @@ class MyDocument(BaseDocument): def test_set_torch_embedding(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): embedding: TorchEmbedding d = MyDocument(embedding=torch.zeros((128,))) diff --git a/tests/integrations/typing/test_typing_proto.py b/tests/integrations/typing/test_typing_proto.py index 9d5b8040ee3..a6f3f571659 100644 --- a/tests/integrations/typing/test_typing_proto.py +++ b/tests/integrations/typing/test_typing_proto.py @@ -2,8 +2,8 @@ import pytest import torch -from docarray import BaseDocument -from docarray.base_document import AnyDocument +from docarray import BaseDoc +from docarray.base_document import AnyDoc from docarray.typing import ( AnyEmbedding, AnyUrl, @@ -18,7 +18,7 @@ @pytest.mark.proto def test_proto_all_types(): - class Mymmdoc(BaseDocument): + class Mymmdoc(BaseDoc): tensor: NdArray torch_tensor: TorchTensor embedding: AnyEmbedding @@ -39,7 +39,7 @@ class Mymmdoc(BaseDocument): point_cloud_url='http://jina.ai/mesh.obj', ) - new_doc = AnyDocument.from_protobuf(doc.to_protobuf()) + new_doc = AnyDoc.from_protobuf(doc.to_protobuf()) for field, value in new_doc: if field == 'embedding': @@ -55,7 +55,7 @@ def test_proto_all_types_proto3(): from docarray.typing import TensorFlowTensor - class Mymmdoc(BaseDocument): + class Mymmdoc(BaseDoc): tensor: NdArray torch_tensor: TorchTensor tf_tensor: TensorFlowTensor @@ -78,7 +78,7 @@ class Mymmdoc(BaseDocument): point_cloud_url='http://jina.ai/mesh.obj', ) - new_doc = AnyDocument.from_protobuf(doc.to_protobuf()) + new_doc = AnyDoc.from_protobuf(doc.to_protobuf()) for field, value in new_doc: if field == 'embedding': diff --git a/tests/units/array/stack/storage/test_storage.py b/tests/units/array/stack/storage/test_storage.py index 335128e2add..280b986c886 100644 --- a/tests/units/array/stack/storage/test_storage.py +++ b/tests/units/array/stack/storage/test_storage.py @@ -1,16 +1,16 @@ import numpy as np -from docarray import BaseDocument +from docarray import BaseDoc from 
docarray.array import DocumentArrayStacked from docarray.array.stacked.column_storage import ColumnStorageView from docarray.typing import AnyTensor def test_column_storage_init(): - class InnerDoc(BaseDocument): + class InnerDoc(BaseDoc): price: int - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: AnyTensor name: str doc: InnerDoc @@ -32,7 +32,7 @@ class MyDoc(BaseDocument): def test_column_storage_view(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: AnyTensor name: str diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index dcaa3e89a0a..66d82ea523e 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -5,7 +5,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.array import DocumentArrayStacked from docarray.documents import ImageDoc from docarray.typing import AnyEmbedding, AnyTensor, NdArray, TorchTensor @@ -13,7 +13,7 @@ @pytest.fixture() def batch(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArrayStacked[ImageDoc]( @@ -25,10 +25,10 @@ class ImageDoc(BaseDocument): @pytest.fixture() def nested_batch(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): img: DocumentArray[ImageDoc] batch = DocumentArray[MMdoc]( @@ -72,7 +72,7 @@ def test_iterator(batch): def test_stack_setter(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArray[ImageDoc]( @@ -89,7 +89,7 @@ class ImageDoc(BaseDocument): def test_stack_setter_np(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: NdArray[3, 224, 224] batch = DocumentArray[ImageDoc]( @@ -113,7 +113,7 @@ def test_stack_optional(batch): def 
test_stack_numpy(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: NdArray[3, 224, 224] batch = DocumentArray[ImageDoc]( @@ -146,10 +146,10 @@ def test_stack(batch): def test_stack_mod_nested_document(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): img: ImageDoc batch = DocumentArray[MMdoc]( @@ -185,7 +185,7 @@ def test_stack_nested_documentarray(nested_batch): def test_convert_to_da(batch): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArray[ImageDoc]( @@ -200,10 +200,10 @@ class ImageDoc(BaseDocument): def test_unstack_nested_document(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): img: ImageDoc batch = DocumentArray[MMdoc]( @@ -227,7 +227,7 @@ def test_unstack_nested_documentarray(nested_batch): def test_stack_call(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] da = DocumentArray[ImageDoc]( @@ -242,7 +242,7 @@ class ImageDoc(BaseDocument): def test_stack_union(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: Union[NdArray[3, 224, 224], TorchTensor[3, 224, 224]] batch = DocumentArray[ImageDoc]( @@ -260,7 +260,7 @@ class ImageDoc(BaseDocument): [(TorchTensor, torch.zeros(3, 224, 224)), (NdArray, np.zeros((3, 224, 224)))], ) def test_any_tensor_with_torch(tensor_type, tensor): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: AnyTensor da = DocumentArrayStacked[ImageDoc]( @@ -278,10 +278,10 @@ class ImageDoc(BaseDocument): def test_any_tensor_with_optional(): tensor = torch.zeros(3, 224, 224) - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: Optional[AnyTensor] - class TopDoc(BaseDocument): + class TopDoc(BaseDoc): img: ImageDoc da = DocumentArrayStacked[TopDoc]( @@ -297,7 +297,7 @@ 
class TopDoc(BaseDocument): def test_dict_stack(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): my_dict: Dict[str, int] da = DocumentArrayStacked[MyDoc]( @@ -308,7 +308,7 @@ class MyDoc(BaseDocument): def test_get_from_slice_stacked(): - class Doc(BaseDocument): + class Doc(BaseDoc): text: str tensor: NdArray @@ -331,7 +331,7 @@ class Doc(BaseDocument): def test_stack_embedding(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): embedding: AnyEmbedding da = DocumentArrayStacked[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) @@ -342,7 +342,7 @@ class MyDoc(BaseDocument): @pytest.mark.parametrize('tensor_backend', [TorchTensor, NdArray]) def test_stack_none(tensor_backend): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: Optional[AnyTensor] da = DocumentArrayStacked[MyDoc]( @@ -362,7 +362,7 @@ def test_to_device(): def test_to_device_with_nested_da(): - class Video(BaseDocument): + class Video(BaseDoc): images: DocumentArray[ImageDoc] da_image = DocumentArrayStacked[ImageDoc]( @@ -376,7 +376,7 @@ class Video(BaseDocument): def test_to_device_nested(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: TorchTensor docs: ImageDoc @@ -400,7 +400,7 @@ def test_to_device_numpy(): def test_keep_dtype_torch(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: TorchTensor da = DocumentArray[MyDoc]( @@ -414,7 +414,7 @@ class MyDoc(BaseDocument): def test_keep_dtype_np(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: NdArray da = DocumentArray[MyDoc]( @@ -435,7 +435,7 @@ def test_del_item(batch): def test_np_scalar(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): scalar: NdArray da = DocumentArray[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) @@ -455,7 +455,7 @@ class MyDoc(BaseDocument): def test_torch_scalar(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): scalar: TorchTensor da = DocumentArray[MyDoc]( @@ -475,7 +475,7 @@ class MyDoc(BaseDocument): def test_np_nan(): - class 
MyDoc(BaseDocument): + class MyDoc(BaseDoc): scalar: Optional[NdArray] da = DocumentArray[MyDoc]([MyDoc() for _ in range(3)]) @@ -494,7 +494,7 @@ class MyDoc(BaseDocument): def test_torch_nan(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): scalar: Optional[TorchTensor] da = DocumentArray[MyDoc]([MyDoc() for _ in range(3)]) @@ -514,7 +514,7 @@ class MyDoc(BaseDocument): def test_from_storage(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArrayStacked[ImageDoc]( @@ -525,7 +525,7 @@ class ImageDoc(BaseDocument): def test_validate_from_da(): - class ImageDoc(BaseDocument): + class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArray[ImageDoc]( @@ -552,10 +552,10 @@ def test_validation_column_tensor_fail(batch): @pytest.fixture() def batch_nested_doc(): - class Inner(BaseDocument): + class Inner(BaseDoc): hello: str - class Doc(BaseDocument): + class Doc(BaseDoc): inner: Inner batch = DocumentArrayStacked[Doc]( diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index f18a4604fde..5b06a3c1b3c 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.array import DocumentArrayStacked from docarray.typing import AnyTensor, NdArray from docarray.utils.misc import is_tf_available @@ -17,7 +17,7 @@ @pytest.fixture() def batch(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] import tensorflow as tf @@ -31,10 +31,10 @@ class Image(BaseDocument): @pytest.fixture() def nested_batch(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): img: DocumentArray[Image] import tensorflow as tf @@ -81,7 +81,7 
@@ def test_iterator(batch): @pytest.mark.tensorflow def test_set_after_stacking(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] batch = DocumentArrayStacked[Image]( @@ -105,10 +105,10 @@ def test_stack_optional(batch): @pytest.mark.tensorflow def test_stack_mod_nested_document(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): img: Image batch = DocumentArray[MMdoc]( @@ -146,10 +146,10 @@ def test_convert_to_da(batch): @pytest.mark.tensorflow def test_unstack_nested_document(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): img: Image batch = DocumentArrayStacked[MMdoc]( @@ -173,7 +173,7 @@ def test_unstack_nested_documentarray(nested_batch): @pytest.mark.tensorflow def test_stack_call(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] da = DocumentArray[Image]( @@ -189,7 +189,7 @@ class Image(BaseDocument): @pytest.mark.tensorflow def test_stack_union(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: Union[NdArray[3, 224, 224], TensorFlowTensor[3, 224, 224]] DocumentArrayStacked[Image]( @@ -216,7 +216,7 @@ def test_setitem_tensor_direct(batch): def test_any_tensor_with_tf(): tensor = tf.zeros((3, 224, 224)) - class Image(BaseDocument): + class Image(BaseDoc): tensor: AnyTensor da = DocumentArrayStacked[Image]( @@ -235,10 +235,10 @@ class Image(BaseDocument): def test_any_tensor_with_optional(): tensor = tf.zeros((3, 224, 224)) - class Image(BaseDocument): + class Image(BaseDoc): tensor: Optional[AnyTensor] - class TopDoc(BaseDocument): + class TopDoc(BaseDoc): img: Image da = DocumentArrayStacked[TopDoc]( @@ -256,7 +256,7 @@ class TopDoc(BaseDocument): @pytest.mark.tensorflow def test_get_from_slice_stacked(): - class Doc(BaseDocument): + class Doc(BaseDoc): text: str 
tensor: TensorFlowTensor @@ -273,7 +273,7 @@ class Doc(BaseDocument): @pytest.mark.tensorflow def test_stack_none(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: Optional[AnyTensor] da = DocumentArrayStacked[MyDoc]( @@ -284,7 +284,7 @@ class MyDoc(BaseDocument): @pytest.mark.tensorflow def test_keep_dtype_tf(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: TensorFlowTensor da = DocumentArray[MyDoc]( diff --git a/tests/units/array/stack/test_init.py b/tests/units/array/stack/test_init.py index dd7f0b12925..aedd761aadc 100644 --- a/tests/units/array/stack/test_init.py +++ b/tests/units/array/stack/test_init.py @@ -1,12 +1,12 @@ import numpy as np -from docarray import BaseDocument +from docarray import BaseDoc from docarray.array.stacked.array_stacked import DocumentArrayStacked from docarray.typing import AnyTensor, NdArray def test_da_init(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: AnyTensor name: str @@ -19,7 +19,7 @@ class MyDoc(BaseDocument): def test_da_iter(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: AnyTensor name: str diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py index 27dc4bd006b..6a0f2881b08 100644 --- a/tests/units/array/stack/test_proto.py +++ b/tests/units/array/stack/test_proto.py @@ -2,14 +2,14 @@ import pytest import torch -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.array import DocumentArrayStacked from docarray.typing import NdArray, TorchTensor @pytest.fixture() def batch(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArray[Image]( @@ -26,7 +26,7 @@ def test_proto_stacked_mode_torch(batch): @pytest.mark.proto def test_proto_stacked_mode_numpy(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: NdArray[3, 224, 224] da = DocumentArray[MyDoc]( @@ -40,7 +40,7 @@ class MyDoc(BaseDocument): 
@pytest.mark.proto def test_stacked_proto(): - class CustomDocument(BaseDocument): + class CustomDocument(BaseDoc): image: NdArray da = DocumentArray[CustomDocument]( diff --git a/tests/units/array/test_array.py b/tests/units/array/test_array.py index af95e05bfb1..0126fbb69e9 100644 --- a/tests/units/array/test_array.py +++ b/tests/units/array/test_array.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.typing import ImageUrl, NdArray, TorchTensor from docarray.utils.misc import is_tf_available @@ -15,7 +15,7 @@ @pytest.fixture() def da(): - class Text(BaseDocument): + class Text(BaseDoc): text: str return DocumentArray[Text]([Text(text=f'hello {i}') for i in range(10)]) @@ -27,7 +27,7 @@ def test_iterate(da): def test_append(): - class Text(BaseDocument): + class Text(BaseDoc): text: str da = DocumentArray[Text]([]) @@ -39,7 +39,7 @@ class Text(BaseDocument): def test_extend(): - class Text(BaseDocument): + class Text(BaseDoc): text: str da = DocumentArray[Text]([Text(text='hello', id=str(i)) for i in range(10)]) @@ -58,7 +58,7 @@ def test_slice(da): def test_document_array(): - class Text(BaseDocument): + class Text(BaseDoc): text: str da = DocumentArray([Text(text='hello') for _ in range(10)]) @@ -72,7 +72,7 @@ def test_empty_array(): def test_document_array_fixed_type(): - class Text(BaseDocument): + class Text(BaseDoc): text: str da = DocumentArray[Text]([Text(text='hello') for _ in range(10)]) @@ -81,7 +81,7 @@ class Text(BaseDocument): def test_get_bulk_attributes_function(): - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): text: str tensor: NdArray @@ -105,10 +105,10 @@ class Mmdoc(BaseDocument): def test_set_attributes(): - class InnerDoc(BaseDocument): + class InnerDoc(BaseDoc): text: str - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): inner: InnerDoc N = 10 @@ -125,7 +125,7 @@ class Mmdoc(BaseDocument): def test_get_bulk_attributes(): - class 
Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): text: str tensor: NdArray @@ -149,10 +149,10 @@ class Mmdoc(BaseDocument): def test_get_bulk_attributes_document(): - class InnerDoc(BaseDocument): + class InnerDoc(BaseDoc): text: str - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): inner: InnerDoc N = 10 @@ -165,7 +165,7 @@ class Mmdoc(BaseDocument): def test_get_bulk_attributes_optional_type(): - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): text: str tensor: Optional[NdArray] @@ -189,7 +189,7 @@ class Mmdoc(BaseDocument): def test_get_bulk_attributes_union_type(): - class Mmdoc(BaseDocument): + class Mmdoc(BaseDoc): text: str tensor: Union[NdArray, TorchTensor] @@ -215,7 +215,7 @@ class Mmdoc(BaseDocument): @pytest.mark.tensorflow def test_get_bulk_attributes_union_type_nested(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): embedding: Union[Optional[TorchTensor], Optional[NdArray]] embedding2: Optional[Union[TorchTensor, NdArray, TensorFlowTensor]] embedding3: Optional[Optional[TorchTensor]] @@ -244,7 +244,7 @@ class MyDoc(BaseDocument): def test_get_from_slice(): - class Doc(BaseDocument): + class Doc(BaseDoc): text: str tensor: NdArray @@ -297,12 +297,12 @@ def test_del_item(da): def test_generic_type_var(): - T = TypeVar('T', bound=BaseDocument) + T = TypeVar('T', bound=BaseDoc) def f(a: DocumentArray[T]) -> DocumentArray[T]: return a - def g(a: DocumentArray['BaseDocument']) -> DocumentArray['BaseDocument']: + def g(a: DocumentArray['BaseDoc']) -> DocumentArray['BaseDoc']: return a a = DocumentArray() @@ -311,7 +311,7 @@ def g(a: DocumentArray['BaseDocument']) -> DocumentArray['BaseDocument']: def test_construct(): - class Text(BaseDocument): + class Text(BaseDoc): text: str docs = [Text(text=f'hello {i}') for i in range(10)] @@ -322,7 +322,7 @@ class Text(BaseDocument): def test_reverse(): - class Text(BaseDocument): + class Text(BaseDoc): text: str docs = [Text(text=f'hello {i}') for i in range(10)] @@ -333,7 +333,7 @@ class Text(BaseDocument): 
assert da[0].text == 'hello 9' -class Image(BaseDocument): +class Image(BaseDoc): tensor: Optional[NdArray] url: ImageUrl diff --git a/tests/units/array/test_array_from_to_bytes.py b/tests/units/array/test_array_from_to_bytes.py index ede1dce7de9..7112763402e 100644 --- a/tests/units/array/test_array_from_to_bytes.py +++ b/tests/units/array/test_array_from_to_bytes.py @@ -1,11 +1,11 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from docarray.typing import NdArray -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): embedding: NdArray text: str image: ImageDoc diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py index 5fc81e0f40b..2a4049f4290 100644 --- a/tests/units/array/test_array_from_to_csv.py +++ b/tests/units/array/test_array_from_to_csv.py @@ -3,14 +3,14 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from tests import TOYDATA_DIR @pytest.fixture() def nested_doc_cls(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): count: Optional[int] text: str @@ -72,14 +72,14 @@ def test_from_csv_nested(nested_doc_cls): @pytest.fixture() def nested_doc(): - class Inner(BaseDocument): + class Inner(BaseDoc): img: Optional[ImageDoc] - class Middle(BaseDocument): + class Middle(BaseDoc): img: Optional[ImageDoc] inner: Optional[Inner] - class Outer(BaseDocument): + class Outer(BaseDoc): img: Optional[ImageDoc] middle: Optional[Middle] diff --git a/tests/units/array/test_array_from_to_json.py b/tests/units/array/test_array_from_to_json.py index 9ffe5080693..2e910496a32 100644 --- a/tests/units/array/test_array_from_to_json.py +++ b/tests/units/array/test_array_from_to_json.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import 
ImageDoc from docarray.typing import NdArray -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): embedding: NdArray text: str image: ImageDoc diff --git a/tests/units/array/test_array_from_to_pandas.py b/tests/units/array/test_array_from_to_pandas.py index c6a54322efd..0ca762807a6 100644 --- a/tests/units/array/test_array_from_to_pandas.py +++ b/tests/units/array/test_array_from_to_pandas.py @@ -3,13 +3,13 @@ import pandas as pd import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc @pytest.fixture() def nested_doc_cls(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): count: Optional[int] text: str @@ -54,14 +54,14 @@ def test_to_from_pandas_df(nested_doc_cls): @pytest.fixture() def nested_doc(): - class Inner(BaseDocument): + class Inner(BaseDoc): img: Optional[ImageDoc] - class Middle(BaseDocument): + class Middle(BaseDoc): img: Optional[ImageDoc] inner: Optional[Inner] - class Outer(BaseDocument): + class Outer(BaseDoc): img: Optional[ImageDoc] middle: Optional[Middle] diff --git a/tests/units/array/test_array_proto.py b/tests/units/array/test_array_proto.py index d6dbb767bcd..5ba2b0fef65 100644 --- a/tests/units/array/test_array_proto.py +++ b/tests/units/array/test_array_proto.py @@ -1,14 +1,14 @@ import numpy as np import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @pytest.mark.proto def test_simple_proto(): - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: str tensor: NdArray @@ -25,7 +25,7 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_nested_proto(): - class CustomDocument(BaseDocument): + class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc @@ -44,7 +44,7 @@ class CustomDocument(BaseDocument): @pytest.mark.proto def test_nested_proto_any_doc(): - class 
CustomDocument(BaseDocument): + class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc diff --git a/tests/units/array/test_array_save_load.py b/tests/units/array/test_array_save_load.py index 8f5143fd57e..6d5d9a8da4a 100644 --- a/tests/units/array/test_array_save_load.py +++ b/tests/units/array/test_array_save_load.py @@ -3,12 +3,12 @@ import numpy as np import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from docarray.typing import NdArray -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): embedding: NdArray text: str image: ImageDoc diff --git a/tests/units/array/test_batching.py b/tests/units/array/test_batching.py index 26a14fbf79a..f7bce9bea96 100644 --- a/tests/units/array/test_batching.py +++ b/tests/units/array/test_batching.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.typing import NdArray @@ -9,7 +9,7 @@ @pytest.mark.parametrize('stack', [False, True]) @pytest.mark.parametrize('batch_size,n_batches', [(16, 7), (10, 10)]) def test_batch(shuffle, stack, batch_size, n_batches): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): id: int tensor: NdArray diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py index 23f4773c4e9..7d0fb36b0af 100644 --- a/tests/units/array/test_generic_array.py +++ b/tests/units/array/test_generic_array.py @@ -1,9 +1,9 @@ -from docarray import BaseDocument, DocumentArray -from docarray.base_document import AnyDocument +from docarray import BaseDoc, DocumentArray +from docarray.base_document import AnyDoc def test_generic_init(): - class Text(BaseDocument): + class Text(BaseDoc): text: str da = DocumentArray[Text]([]) @@ -14,6 +14,6 @@ class Text(BaseDocument): def test_normal_access_init(): da = DocumentArray([]) - da.document_type == AnyDocument + da.document_type == AnyDoc 
assert isinstance(da, DocumentArray) diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py index c94044f596f..700ecedd3a4 100644 --- a/tests/units/array/test_traverse.py +++ b/tests/units/array/test_traverse.py @@ -3,8 +3,8 @@ import pytest import torch -from docarray import BaseDocument, DocumentArray -from docarray.array.abstract_array import AnyDocumentArray +from docarray import BaseDoc, DocumentArray +from docarray.array.abstract_array import AnyDocArray from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -15,15 +15,15 @@ @pytest.fixture def multi_model_docs(): - class SubSubDoc(BaseDocument): + class SubSubDoc(BaseDoc): sub_sub_text: TextDoc sub_sub_tensor: TorchTensor[2] - class SubDoc(BaseDocument): + class SubDoc(BaseDoc): sub_text: TextDoc sub_da: DocumentArray[SubSubDoc] - class MultiModalDoc(BaseDocument): + class MultiModalDoc(BaseDoc): mm_text: TextDoc mm_tensor: Optional[TorchTensor[3, 2, 2]] mm_da: DocumentArray[SubDoc] @@ -78,7 +78,7 @@ def test_traverse_flat(multi_model_docs, access_path, len_result): def test_traverse_stacked_da(): - class Image(BaseDocument): + class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] batch = DocumentArray[Image]( @@ -106,13 +106,13 @@ class Image(BaseDocument): ], ) def test_flatten_one_level(input_list, output_list): - flattened = AnyDocumentArray._flatten_one_level(sequence=input_list) + flattened = AnyDocArray._flatten_one_level(sequence=input_list) assert flattened == output_list def test_flatten_one_level_list_of_da(): - doc = BaseDocument() + doc = BaseDoc() input_list = [DocumentArray([doc, doc, doc])] - flattened = AnyDocumentArray._flatten_one_level(sequence=input_list) + flattened = AnyDocArray._flatten_one_level(sequence=input_list) assert flattened == [doc, doc, doc] diff --git a/tests/units/document/proto/test_document_proto.py b/tests/units/document/proto/test_document_proto.py index 73df031be07..bd132966c38 100644 --- 
a/tests/units/document/proto/test_document_proto.py +++ b/tests/units/document/proto/test_document_proto.py @@ -5,7 +5,7 @@ import torch from docarray import DocumentArray -from docarray.base_document import BaseDocument +from docarray.base_document import BaseDoc from docarray.typing import NdArray, TorchTensor from docarray.utils.misc import is_tf_available @@ -15,7 +15,7 @@ @pytest.mark.proto def test_proto_simple(): - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: str doc = CustomDoc(text='hello') @@ -25,7 +25,7 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_proto_ndarray(): - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): tensor: NdArray tensor = np.zeros((3, 224, 224)) @@ -38,10 +38,10 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_proto_with_nested_doc(): - class CustomInnerDoc(BaseDocument): + class CustomInnerDoc(BaseDoc): tensor: NdArray - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: str inner: CustomInnerDoc @@ -52,10 +52,10 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_proto_with_chunks_doc(): - class CustomInnerDoc(BaseDocument): + class CustomInnerDoc(BaseDoc): tensor: NdArray - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: str chunks: DocumentArray[CustomInnerDoc] @@ -74,10 +74,10 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_proto_with_nested_doc_pytorch(): - class CustomInnerDoc(BaseDocument): + class CustomInnerDoc(BaseDoc): tensor: TorchTensor - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: str inner: CustomInnerDoc @@ -90,10 +90,10 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_proto_with_chunks_doc_pytorch(): - class CustomInnerDoc(BaseDocument): + class CustomInnerDoc(BaseDoc): tensor: TorchTensor - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: str chunks: DocumentArray[CustomInnerDoc] @@ -112,7 +112,7 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def 
test_optional_field_in_doc(): - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: Optional[str] CustomDoc.from_protobuf(CustomDoc().to_protobuf()) @@ -120,10 +120,10 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_optional_field_nested_in_doc(): - class InnerDoc(BaseDocument): + class InnerDoc(BaseDoc): title: str - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): text: Optional[InnerDoc] CustomDoc.from_protobuf(CustomDoc().to_protobuf()) @@ -131,7 +131,7 @@ class CustomDoc(BaseDocument): @pytest.mark.proto def test_integer_field(): - class Meow(BaseDocument): + class Meow(BaseDoc): age: int wealth: float registered: bool @@ -145,7 +145,7 @@ class Meow(BaseDocument): @pytest.mark.proto def test_list_set_dict_tuple_field(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): list_: List dict_: Dict tuple_: Tuple @@ -178,7 +178,7 @@ class MyDoc(BaseDocument): ], ) def test_ndarray_dtype(dtype): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: NdArray doc = MyDoc(tensor=np.ndarray([1, 2, 3], dtype=dtype)) @@ -201,7 +201,7 @@ class MyDoc(BaseDocument): ], ) def test_torch_dtype(dtype): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: TorchTensor doc = MyDoc(tensor=torch.zeros([5, 5], dtype=dtype)) @@ -212,7 +212,7 @@ class MyDoc(BaseDocument): @pytest.mark.proto def test_nested_dict(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: Dict doc = MyDoc(data={'data': (1, 2)}) @@ -222,7 +222,7 @@ class MyDoc(BaseDocument): @pytest.mark.proto def test_tuple_complex(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: Tuple doc = MyDoc(data=(1, 2)) @@ -234,7 +234,7 @@ class MyDoc(BaseDocument): @pytest.mark.proto def test_list_complex(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: List doc = MyDoc(data=[(1, 2)]) @@ -246,7 +246,7 @@ class MyDoc(BaseDocument): @pytest.mark.proto def test_nested_tensor_list(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: List doc = 
MyDoc(data=[np.zeros(10)]) @@ -261,7 +261,7 @@ class MyDoc(BaseDocument): @pytest.mark.proto def test_nested_tensor_dict(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: Dict doc = MyDoc(data={'hello': np.zeros(10)}) @@ -276,7 +276,7 @@ class MyDoc(BaseDocument): @pytest.mark.proto def test_super_complex_nested(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: Dict data = {'hello': (torch.zeros(55), 1, 'hi', [torch.ones(55), np.zeros(10), (1, 2)])} @@ -289,7 +289,7 @@ class MyDoc(BaseDocument): @pytest.mark.tensorflow def test_super_complex_nested_tensorflow(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): data: Dict data = {'hello': (torch.zeros(55), 1, 'hi', [tf.ones(55), np.zeros(10), (1, 2)])} diff --git a/tests/units/document/test_any_document.py b/tests/units/document/test_any_document.py index 457b61b3bee..5fae5d05b00 100644 --- a/tests/units/document/test_any_document.py +++ b/tests/units/document/test_any_document.py @@ -1,15 +1,15 @@ import numpy as np -from docarray.base_document import AnyDocument, BaseDocument +from docarray.base_document import AnyDoc, BaseDoc from docarray.typing import NdArray def test_any_doc(): - class InnerDocument(BaseDocument): + class InnerDocument(BaseDoc): text: str tensor: NdArray - class CustomDoc(BaseDocument): + class CustomDoc(BaseDoc): inner: InnerDocument text: str @@ -17,7 +17,7 @@ class CustomDoc(BaseDocument): text='bye', inner=InnerDocument(text='hello', tensor=np.zeros((3, 224, 224))) ) - any_doc = AnyDocument(**doc.__dict__) + any_doc = AnyDoc(**doc.__dict__) assert any_doc.text == doc.text assert any_doc.inner.text == doc.inner.text diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py index 6a76c58f56b..91d02e600f5 100644 --- a/tests/units/document/test_base_document.py +++ b/tests/units/document/test_base_document.py @@ -1,15 +1,16 @@ -from typing import Optional, List -from docarray.base_document.document import BaseDocument +from 
typing import List, Optional + +from docarray.base_document.doc import BaseDoc def test_base_document_init(): - doc = BaseDocument() + doc = BaseDoc() assert doc.id is not None def test_update(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): content: str title: Optional[str] = None tags_: List diff --git a/tests/units/document/test_from_to_bytes.py b/tests/units/document/test_from_to_bytes.py index c05719dc7d2..5a3eb620780 100644 --- a/tests/units/document/test_from_to_bytes.py +++ b/tests/units/document/test_from_to_bytes.py @@ -1,11 +1,11 @@ import pytest -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import ImageDoc from docarray.typing import NdArray -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): embedding: NdArray text: str image: ImageDoc diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py index a57afc139a3..0ed1745f3fb 100644 --- a/tests/units/document/test_update.py +++ b/tests/units/document/test_update.py @@ -2,16 +2,16 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc -class InnerDoc(BaseDocument): +class InnerDoc(BaseDoc): integer: int inner_list: List -class MMDoc(BaseDocument): +class MMDoc(BaseDoc): text: str = '' price: int = 0 categories: Optional[List[str]] = None @@ -75,7 +75,7 @@ def test_update_complex(doc1, doc2): def test_update_simple(): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): content: str title: Optional[str] = None tags_: List @@ -92,10 +92,10 @@ class MyDocument(BaseDocument): def test_update_different_schema_fails(): - class DocA(BaseDocument): + class DocA(BaseDoc): content: str - class DocB(BaseDocument): + class DocB(BaseDoc): image: Optional[ImageDoc] = None docA = DocA(content='haha') diff --git a/tests/units/document/test_view.py b/tests/units/document/test_view.py index 3ba4836e63c..1fdbe2f5a9f 100644 --- 
a/tests/units/document/test_view.py +++ b/tests/units/document/test_view.py @@ -1,13 +1,13 @@ import numpy as np -from docarray import BaseDocument +from docarray import BaseDoc from docarray.array import DocumentArrayStacked from docarray.array.stacked.column_storage import ColumnStorageView from docarray.typing import AnyTensor def test_document_view(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): tensor: AnyTensor name: str diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py index cd3131eb0ae..9dd300a9dec 100644 --- a/tests/units/test_helper.py +++ b/tests/units/test_helper.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from docarray.helper import ( _access_path_dict_to_nested_dict, @@ -16,14 +16,14 @@ @pytest.fixture() def nested_doc(): - class Inner(BaseDocument): + class Inner(BaseDoc): img: Optional[ImageDoc] - class Middle(BaseDocument): + class Middle(BaseDoc): img: Optional[ImageDoc] inner: Optional[Inner] - class Outer(BaseDocument): + class Outer(BaseDoc): img: Optional[ImageDoc] middle: Optional[Middle] da: DocumentArray[Inner] @@ -51,7 +51,7 @@ def test_is_access_path_not_valid(nested_doc): def test_get_access_paths(): - class Painting(BaseDocument): + class Painting(BaseDoc): title: str img: ImageDoc diff --git a/tests/units/typing/da/test_relations.py b/tests/units/typing/da/test_relations.py index 424d22b633b..dad579ad81d 100644 --- a/tests/units/typing/da/test_relations.py +++ b/tests/units/typing/da/test_relations.py @@ -1,8 +1,8 @@ -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray def test_instance_and_equivalence(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): text: str docs = DocumentArray[MyDoc]([MyDoc(text='hello')]) @@ -14,7 +14,7 @@ class MyDoc(BaseDocument): def test_subclassing(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): text: str 
class MyDocArray(DocumentArray[MyDoc]): @@ -28,6 +28,6 @@ class MyDocArray(DocumentArray[MyDoc]): assert isinstance(docs, MyDocArray) assert isinstance(docs, DocumentArray[MyDoc]) - assert issubclass(MyDoc, BaseDocument) - assert not issubclass(DocumentArray[MyDoc], DocumentArray[BaseDocument]) - assert not issubclass(MyDocArray, DocumentArray[BaseDocument]) + assert issubclass(MyDoc, BaseDoc) + assert not issubclass(DocumentArray[MyDoc], DocumentArray[BaseDoc]) + assert not issubclass(MyDocArray, DocumentArray[BaseDoc]) diff --git a/tests/units/typing/tensor/test_audio_tensor.py b/tests/units/typing/tensor/test_audio_tensor.py index b10954e1862..3bc20a1c7f6 100644 --- a/tests/units/typing/tensor/test_audio_tensor.py +++ b/tests/units/typing/tensor/test_audio_tensor.py @@ -5,7 +5,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing.tensor.audio.audio_ndarray import AudioNdArray from docarray.typing.tensor.audio.audio_torch_tensor import AudioTorchTensor from docarray.utils.misc import is_tf_available @@ -26,7 +26,7 @@ ], ) def test_set_audio_tensor(tensor, cls_audio_tensor, cls_tensor): - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): tensor: cls_audio_tensor doc = MyAudioDoc(tensor=tensor) @@ -37,7 +37,7 @@ class MyAudioDoc(BaseDocument): @pytest.mark.tensorflow def test_set_audio_tensorflow_tensor(): - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): tensor: AudioTensorFlowTensor doc = MyAudioDoc(tensor=tf.zeros((1000, 2))) diff --git a/tests/units/typing/tensor/test_np_ops.py b/tests/units/typing/tensor/test_np_ops.py index 2b4ecc9df47..2398b19fa54 100644 --- a/tests/units/typing/tensor/test_np_ops.py +++ b/tests/units/typing/tensor/test_np_ops.py @@ -1,14 +1,14 @@ import numpy as np -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import NdArray def test_tensor_ops(): - class A(BaseDocument): + class A(BaseDoc): 
tensor: NdArray[3, 224, 224] - class B(BaseDocument): + class B(BaseDoc): tensor: NdArray[3, 112, 224] tensor = A(tensor=np.ones((3, 224, 224))).tensor diff --git a/tests/units/typing/tensor/test_torch_ops.py b/tests/units/typing/tensor/test_torch_ops.py index 1a7d1d09386..8452d2e2aa8 100644 --- a/tests/units/typing/tensor/test_torch_ops.py +++ b/tests/units/typing/tensor/test_torch_ops.py @@ -1,14 +1,14 @@ import torch -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import TorchTensor def test_tensor_ops(): - class A(BaseDocument): + class A(BaseDoc): tensor: TorchTensor[3, 224, 224] - class B(BaseDocument): + class B(BaseDoc): tensor: TorchTensor[3, 112, 224] tensor = A(tensor=torch.ones(3, 224, 224)).tensor diff --git a/tests/units/typing/tensor/test_torch_tensor.py b/tests/units/typing/tensor/test_torch_tensor.py index f728e06e323..8b1d5f8250b 100644 --- a/tests/units/typing/tensor/test_torch_tensor.py +++ b/tests/units/typing/tensor/test_torch_tensor.py @@ -165,9 +165,9 @@ def test_parametrized_operations(): def test_deepcopy(): - from docarray import BaseDocument + from docarray import BaseDoc - class MMdoc(BaseDocument): + class MMdoc(BaseDoc): embedding: TorchEmbedding doc = MMdoc(embedding=torch.randn(32)) diff --git a/tests/units/typing/tensor/test_video_tensor.py b/tests/units/typing/tensor/test_video_tensor.py index 9fc2f4129a9..f9981dedbc9 100644 --- a/tests/units/typing/tensor/test_video_tensor.py +++ b/tests/units/typing/tensor/test_video_tensor.py @@ -5,7 +5,7 @@ import torch from pydantic.tools import parse_obj_as -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import ( AudioNdArray, AudioTorchTensor, @@ -30,7 +30,7 @@ ], ) def test_set_video_tensor(tensor, cls_video_tensor, cls_tensor): - class MyVideoDoc(BaseDocument): + class MyVideoDoc(BaseDoc): tensor: cls_video_tensor doc = MyVideoDoc(tensor=tensor) @@ -42,7 +42,7 @@ class MyVideoDoc(BaseDocument): 
@pytest.mark.tensorflow def test_set_video_tensor_tensorflow(): - class MyVideoDoc(BaseDocument): + class MyVideoDoc(BaseDoc): tensor: VideoTensorFlowTensor doc = MyVideoDoc(tensor=tf.zeros((1, 224, 224, 3))) diff --git a/tests/units/typing/url/test_audio_url.py b/tests/units/typing/url/test_audio_url.py index 9882f8c46cc..1f326effa7a 100644 --- a/tests/units/typing/url/test_audio_url.py +++ b/tests/units/typing/url/test_audio_url.py @@ -5,7 +5,7 @@ import torch from pydantic.tools import parse_obj_as, schema_json_of -from docarray import BaseDocument +from docarray import BaseDoc from docarray.base_document.io.json import orjson_dumps from docarray.typing import AudioTorchTensor, AudioUrl from docarray.utils.misc import is_tf_available @@ -43,7 +43,7 @@ def test_audio_url(file_url): [*AUDIO_FILES, REMOTE_AUDIO_FILE], ) def test_load_audio_url_to_audio_torch_tensor_field(file_url): - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): audio_url: AudioUrl tensor: Optional[AudioTorchTensor] @@ -62,7 +62,7 @@ class MyAudioDoc(BaseDocument): [*AUDIO_FILES, REMOTE_AUDIO_FILE], ) def test_load_audio_url_to_audio_tensorflow_tensor_field(file_url): - class MyAudioDoc(BaseDocument): + class MyAudioDoc(BaseDoc): audio_url: AudioUrl tensor: Optional[AudioTensorFlowTensor] diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py index 7a92e672cab..c69bd1f3054 100644 --- a/tests/units/typing/url/test_video_url.py +++ b/tests/units/typing/url/test_video_url.py @@ -5,7 +5,7 @@ import torch from pydantic.tools import parse_obj_as, schema_json_of -from docarray import BaseDocument +from docarray import BaseDoc from docarray.base_document.io.json import orjson_dumps from docarray.typing import ( AudioNdArray, @@ -76,7 +76,7 @@ def test_load_one_of_named_tuple_results(file_url, field, attr_cls): [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], ) def test_load_video_url_to_video_torch_tensor_field(file_url): - class MyVideoDoc(BaseDocument): + 
class MyVideoDoc(BaseDoc): video_url: VideoUrl tensor: Optional[VideoTorchTensor] @@ -95,7 +95,7 @@ class MyVideoDoc(BaseDocument): [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], ) def test_load_video_url_to_video_tensorflow_tensor_field(file_url): - class MyVideoDoc(BaseDocument): + class MyVideoDoc(BaseDoc): video_url: VideoUrl tensor: Optional[VideoTensorFlowTensor] diff --git a/tests/units/util/test_filter.py b/tests/units/util/test_filter.py index 58999f3d4b6..c9602a32c83 100644 --- a/tests/units/util/test_filter.py +++ b/tests/units/util/test_filter.py @@ -3,12 +3,12 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc, TextDoc from docarray.utils.filter import filter_docs -class MMDoc(BaseDocument): +class MMDoc(BaseDoc): text_doc: TextDoc text: str = '' image: Optional[ImageDoc] = None @@ -246,7 +246,7 @@ def test_logic_filter(docs, dict_api): @pytest.mark.parametrize('dict_api', [True, False]) def test_from_docstring(dict_api): - class MyDocument(BaseDocument): + class MyDocument(BaseDoc): caption: TextDoc image: ImageDoc price: int diff --git a/tests/units/util/test_find.py b/tests/units/util/test_find.py index efa4a9f9438..342695c072e 100644 --- a/tests/units/util/test_find.py +++ b/tests/units/util/test_find.py @@ -4,16 +4,16 @@ import pytest import torch -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.typing import NdArray, TorchTensor from docarray.utils.find import find, find_batched -class TorchDoc(BaseDocument): +class TorchDoc(BaseDoc): tensor: TorchTensor -class NdDoc(BaseDocument): +class NdDoc(BaseDoc): tensor: NdArray @@ -257,7 +257,7 @@ def test_find_batched_np_stacked(random_nd_batch_query, random_nd_index, stack_w def test_find_optional(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): embedding: Optional[TorchTensor] query = MyDoc(embedding=torch.rand(10)) @@ -275,7 +275,7 @@ class 
MyDoc(BaseDocument): def test_find_union(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): embedding: Union[TorchTensor, NdArray] query = MyDoc(embedding=torch.rand(10)) @@ -294,11 +294,11 @@ class MyDoc(BaseDocument): @pytest.mark.parametrize('stack', [False, True]) def test_find_nested(stack): - class InnerDoc(BaseDocument): + class InnerDoc(BaseDoc): title: str embedding: TorchTensor - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): inner: InnerDoc query = MyDoc(inner=InnerDoc(title='query', embedding=torch.rand(2))) @@ -323,7 +323,7 @@ class MyDoc(BaseDocument): def test_find_nested_union_optional(): - class MyDoc(BaseDocument): + class MyDoc(BaseDoc): embedding: Union[Optional[TorchTensor], Optional[NdArray]] embedding2: Optional[Union[TorchTensor, NdArray]] embedding3: Optional[Optional[TorchTensor]] diff --git a/tests/units/util/test_map.py b/tests/units/util/test_map.py index a098338f822..65227998d73 100644 --- a/tests/units/util/test_map.py +++ b/tests/units/util/test_map.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from docarray.typing import ImageUrl, NdArray from docarray.utils.map import map_docs, map_docs_batch @@ -67,7 +67,7 @@ def load_from_da(da: DocumentArray) -> DocumentArray: return da -class MyImage(BaseDocument): +class MyImage(BaseDoc): tensor: Optional[NdArray] url: ImageUrl diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 48e8c3b28d3..362381b5580 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -2,17 +2,17 @@ import pytest -from docarray import BaseDocument, DocumentArray +from docarray import BaseDoc, DocumentArray from docarray.documents import ImageDoc from docarray.utils.reduce import reduce, reduce_all -class InnerDoc(BaseDocument): +class InnerDoc(BaseDoc): integer: int inner_list: List -class MMDoc(BaseDocument): +class 
MMDoc(BaseDoc): text: str = '' price: int = 0 categories: Optional[List[str]] = None From 13c731d81a974e9924c3ce9af7327fcf43ec0d91 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:30:29 +0200 Subject: [PATCH 02/22] refactor: rename document to doc in da Signed-off-by: samsja --- .github/workflows/ci.yml | 2 +- README.md | 39 ++--- docarray/__init__.py | 4 +- docarray/array/__init__.py | 6 +- docarray/array/abstract_array.py | 68 ++++---- docarray/array/array/array.py | 80 ++++----- docarray/array/array/io.py | 96 +++++------ docarray/array/array/pushpull.py | 30 ++-- docarray/array/stacked/array_stacked.py | 114 ++++++------- docarray/array/stacked/column_storage.py | 12 +- docarray/base_document/mixins/io.py | 2 +- docarray/base_document/mixins/update.py | 8 +- docarray/data/torch_dataset.py | 18 +-- docarray/display/document_array_summary.py | 18 +-- docarray/display/document_summary.py | 14 +- docarray/documents/legacy/legacy_document.py | 12 +- docarray/helper.py | 10 +- docarray/index/abstract.py | 66 ++++---- docarray/index/backends/hnswlib.py | 22 ++- docarray/proto/__init__.py | 20 +-- docarray/proto/docarray.proto | 16 +- docarray/proto/pb/docarray_pb2.py | 68 ++++---- docarray/proto/pb2/docarray_pb2.py | 152 +++++++++--------- docarray/store/abstract_doc_store.py | 46 +++--- docarray/store/file.py | 28 ++-- docarray/store/jac.py | 66 ++++---- docarray/store/s3.py | 32 ++-- docarray/typing/tensor/abstract_tensor.py | 2 +- docarray/utils/filter.py | 12 +- docarray/utils/find.py | 42 +++-- docarray/utils/map.py | 18 +-- docarray/utils/reduce.py | 58 +++---- docs/api_references/array/da.md | 4 +- docs/api_references/array/da_stack.md | 4 +- .../multimodal_training_and_serving.md | 28 ++-- tests/benchmark_tests/test_map.py | 14 +- .../index/base_classes/test_base_doc_store.py | 44 +++-- tests/index/hnswlib/test_index_get_del.py | 12 +- tests/integrations/array/test_torch_train.py | 4 +- tests/integrations/document/test_document.py | 8 +- 
tests/integrations/document/test_proto.py | 10 +- tests/integrations/store/__init__.py | 4 +- tests/integrations/store/test_file.py | 44 +++-- tests/integrations/store/test_jac.py | 37 ++--- tests/integrations/store/test_s3.py | 34 ++-- .../torch/data/test_torch_dataset.py | 30 ++-- .../units/array/stack/storage/test_storage.py | 8 +- tests/units/array/stack/test_array_stacked.py | 100 ++++++------ .../array/stack/test_array_stacked_tf.py | 46 +++--- tests/units/array/stack/test_init.py | 6 +- tests/units/array/stack/test_proto.py | 18 +-- tests/units/array/test_array.py | 54 +++---- tests/units/array/test_array_from_to_bytes.py | 10 +- tests/units/array/test_array_from_to_csv.py | 14 +- tests/units/array/test_array_from_to_json.py | 6 +- .../units/array/test_array_from_to_pandas.py | 10 +- tests/units/array/test_array_proto.py | 14 +- tests/units/array/test_array_save_load.py | 12 +- tests/units/array/test_batching.py | 4 +- tests/units/array/test_generic_array.py | 10 +- tests/units/array/test_indexing.py | 8 +- tests/units/array/test_traverse.py | 14 +- .../document/proto/test_document_proto.py | 10 +- tests/units/document/test_update.py | 18 +-- tests/units/document/test_view.py | 4 +- tests/units/test_helper.py | 6 +- tests/units/typing/da/test_relations.py | 22 +-- tests/units/util/test_filter.py | 8 +- tests/units/util/test_find.py | 18 +-- tests/units/util/test_map.py | 16 +- tests/units/util/test_reduce.py | 32 ++-- 71 files changed, 939 insertions(+), 987 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d39a6bec70d..51759432772 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: poetry install --without dev poetry run pip install tensorflow==2.11.0 - name: Test basic import - run: poetry run python -c 'from docarray import DocumentArray, BaseDocument' + run: poetry run python -c 'from docarray import DocArray, BaseDocument' check-mypy: diff --git a/README.md b/README.md index 
004fa1d4082..c62e8f20a90 100644 --- a/README.md +++ b/README.md @@ -77,9 +77,9 @@ doc = MultiModalDocument( ) ``` -### Collect multiple `Documents` into a `DocumentArray`: +### Collect multiple `Documents` into a `DocArray`: ```python -from docarray import DocumentArray, BaseDocument +from docarray import DocArray, BaseDocument from docarray.typing import AnyTensor, ImageUrl import numpy as np @@ -90,9 +90,9 @@ class Image(BaseDocument): ``` ```python -from docarray import DocumentArray +from docarray import DocArray -da = DocumentArray[Image]( +da = DocArray[Image]( [ Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", @@ -103,7 +103,7 @@ da = DocumentArray[Image]( ) ``` -Access fields at the DocumentArray level: +Access fields at the DocArray level: ```python print(len(da.tensor)) @@ -122,7 +122,7 @@ print(da.tensor.shape) ``` ## Send -- **Serialize** any `Document` or `DocumentArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ +- **Serialize** any `Document` or `DocArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ - Use in **microservice** architecture: Send over **HTTP** or **gRPC** - Integrate seamlessly with **[FastAPI](https://github.com/tiangolo/fastapi/)** and **[Jina](https://github.com/jina-ai/jina/)** @@ -144,22 +144,22 @@ Image.from_protobuf(doc.to_protobuf()) ``` ## Store -- Persist a `DocumentArray` using a **`DocumentStore`** +- Persist a `DocArray` using a **`DocumentStore`** - Store your Documents in any supported (vector) database: **Elasticsearch**, **Qdrant**, **Weaviate**, **Redis**, **Milvus**, **ANNLite** or **SQLite** - Leverage DocumentStores to **perform vector search on your multi-modal data** ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocumentArray +from docarray import DocArray from docarray.documents import ImageDoc from docarray.stores import DocumentStore import numpy as np -da = 
DocumentArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) +da = DocArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) store = DocumentStore[ImageDoc]( storage='qdrant' ) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocumentArray into the DocumentStore +store.insert(da) # insert the DocArray into the DocumentStore # find the 10 most similar images based on the 'embedding' field match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) ``` @@ -186,7 +186,7 @@ If you come from Pydantic, you can see Documents as juiced up models, and DocArr - **ML focused types**: Tensor, TorchTensor, TFTensor, Embedding, ... - **Types that are alive**: ImageUrl can `.load()` a URL to image tensor, TextUrl can load and tokenize text documents, etc. - **Pre-built Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio and more. Note that all of these will be valid Pydantic models! -- The concepts of **DocumentArray and DocumentStore** +- The concepts of **DocArray and DocumentStore** - Cloud-ready: Serialization to **Protobuf** for use with microservices and **gRPC** - Support for **vector search functionalities**, such as `find()` and `embed()` @@ -233,7 +233,7 @@ Not very easy on the eyes if you ask us. 
And even worse, if you need to add one So, now let's see what the same code looks like with DocArray: ```python -from docarray import DocumentArray, BaseDocument +from docarray import DocArray, BaseDocument from docarray.documents import ImageDoc, TextDoc, AudioDoc from docarray.typing import TorchTensor @@ -258,14 +258,14 @@ class MyPodcastModel(nn.Module): self.image_encoder = ImageEncoder() self.text_encoder = TextEncoder() - def forward_podcast(self, da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + def forward_podcast(self, da: DocArray[Podcast]) -> DocArray[Podcast]: da.audio.embedding = self.audio_encoder(da.audio.tensor) da.text.embedding = self.text_encoder(da.text.tensor) da.image.embedding = self.image_encoder(da.image.tensor) return da - def forward(self, da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: + def forward(self, da: DocArray[PairPodcast]) -> DocArray[PairPodcast]: da.left = self.forward_podcast(da.left) da.right = self.forward_podcast(da.right) @@ -297,7 +297,7 @@ This would look like the following: ```python from typing import Optional -from docarray import DocumentArray, BaseDocument +from docarray import DocArray, BaseDocument import tensorflow as tf @@ -312,7 +312,7 @@ class MyPodcastModel(tf.keras.Model): super().__init__() self.audio_encoder = AudioEncoder() - def call(self, inputs: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + def call(self, inputs: DocArray[Podcast]) -> DocArray[Podcast]: inputs.audio_tensor.embedding = self.audio_encoder( inputs.audio_tensor.tensor ) # access audio_tensor's .tensor attribute @@ -407,7 +407,7 @@ store it there, and thus make it searchable: ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocumentArray, BaseDocument +from docarray import DocArray, BaseDocument from docarray.stores import DocumentStore from docarray.documents import ImageDoc, TextDoc import numpy as np @@ -427,11 +427,11 @@ def _random_my_doc(): ) -da = 
DocumentArray([_random_my_doc() for _ in range(1000)]) # create some data +da = DocArray([_random_my_doc() for _ in range(1000)]) # create some data store = DocumentStore[MyDoc]( storage='qdrant' ) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocumentArray into the DocumentStore +store.insert(da) # insert the DocArray into the DocumentStore # find the 10 most similar images based on the image embedding field match = store.find( @@ -453,6 +453,7 @@ from docarray import BaseDocument from docarray.index import HnswDocumentIndex from docarray.typing import NdArray import logging + # get the logger and set the log level to DEBUG logging.getLogger('docarray').setLevel(logging.DEBUG) diff --git a/docarray/__init__.py b/docarray/__init__.py index 2d4c6271119..f41f2f4af6a 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,10 +2,10 @@ import logging -from docarray.array import DocumentArray, DocumentArrayStacked +from docarray.array import DocArray, DocArrayStacked from docarray.base_document.doc import BaseDoc -__all__ = ['BaseDoc', 'DocumentArray', 'DocumentArrayStacked'] +__all__ = ['BaseDoc', 'DocArray', 'DocArrayStacked'] logger = logging.getLogger('docarray') diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 1b88646ebf1..9c0176426e2 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,4 @@ -from docarray.array.array.array import DocumentArray -from docarray.array.stacked.array_stacked import DocumentArrayStacked +from docarray.array.array.array import DocArray +from docarray.array.stacked.array_stacked import DocArrayStacked -__all__ = ['DocumentArray', 'DocumentArrayStacked'] +__all__ = ['DocArray', 'DocArrayStacked'] diff --git a/docarray/array/abstract_array.py b/docarray/array/abstract_array.py index 48faaa11359..ece2fd4270e 100644 --- a/docarray/array/abstract_array.py +++ b/docarray/array/abstract_array.py @@ -20,12 +20,12 @@ import numpy as np from 
docarray.base_document import BaseDoc -from docarray.display.document_array_summary import DocumentArraySummary +from docarray.display.document_array_summary import DocArraySummary from docarray.typing.abstract_type import AbstractType from docarray.utils._typing import change_cls_name if TYPE_CHECKING: - from docarray.proto import DocumentArrayProto, NodeProto + from docarray.proto import DocArrayProto, NodeProto from docarray.typing.tensor.abstract_tensor import AbstractTensor T = TypeVar('T', bound='AnyDocArray') @@ -55,12 +55,12 @@ def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]): if item not in cls.__typed_da__[cls]: # Promote to global scope so multiprocessing can pickle it - global _DocumentArrayTyped + global _DocArrayTyped - class _DocumentArrayTyped(cls): # type: ignore + class _DocArrayTyped(cls): # type: ignore document_type: Type[BaseDoc] = cast(Type[BaseDoc], item) - for field in _DocumentArrayTyped.document_type.__fields__.keys(): + for field in _DocArrayTyped.document_type.__fields__.keys(): def _property_generator(val: str): def _getter(self): @@ -72,16 +72,16 @@ def _setter(self, value): # need docstring for the property return property(fget=_getter, fset=_setter) - setattr(_DocumentArrayTyped, field, _property_generator(field)) + setattr(_DocArrayTyped, field, _property_generator(field)) # this generates property on the fly based on the schema of the item # The global scope and qualname need to refer to this class a unique name. - # Otherwise, creating another _DocumentArrayTyped will overwrite this one. + # Otherwise, creating another _DocArrayTyped will overwrite this one. 
change_cls_name( - _DocumentArrayTyped, f'{cls.__name__}[{item.__name__}]', globals() + _DocArrayTyped, f'{cls.__name__}[{item.__name__}]', globals() ) - cls.__typed_da__[cls][item] = _DocumentArrayTyped + cls.__typed_da__[cls][item] = _DocArrayTyped return cls.__typed_da__[cls][item] @@ -121,27 +121,27 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocumentArray using the passed values + """Set all Documents in this DocArray using the passed values :param field: name of the fields to extract - :values: the values to set at the DocumentArray level + :values: the values to set at the DocArray level """ ... @classmethod @abstractmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocArrayProto') -> T: """create a Document from a protobuf message""" ... @abstractmethod - def to_protobuf(self) -> 'DocumentArrayProto': - """Convert DocumentArray into a Protobuf message""" + def to_protobuf(self) -> 'DocArrayProto': + """Convert DocArray into a Protobuf message""" ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a DocumentArray into a NodeProto protobuf message. - This function should be called when a DocumentArray + """Convert a DocArray into a NodeProto protobuf message. + This function should be called when a DocArray is nested into another Document that need to be converted into a protobuf :return: the nested item protobuf message @@ -157,7 +157,7 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of DocumentArrays, the list will be flattened + results in a nested list or list of DocArrays, the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and "__"-separated. 
It describes the path from the first level to an arbitrary one, e.g. 'content__image__url'. @@ -167,7 +167,7 @@ def traverse_flat( EXAMPLE USAGE .. code-block:: python - from docarray import BaseDoc, DocumentArray, Text + from docarray import BaseDoc, DocArray, Text class Author(BaseDoc): @@ -179,7 +179,7 @@ class Book(BaseDoc): content: Text - da = DocumentArray[Book]( + da = DocArray[Book]( Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) for i in range(10) # noqa: E501 ) @@ -192,7 +192,7 @@ class Book(BaseDoc): EXAMPLE USAGE .. code-block:: python - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray class Chapter(BaseDoc): @@ -200,21 +200,19 @@ class Chapter(BaseDoc): class Book(BaseDoc): - chapters: DocumentArray[Chapter] + chapters: DocArray[Chapter] - da = DocumentArray[Book]( + da = DocArray[Book]( Book( - chapters=DocumentArray[Chapter]( - [Chapter(content='some_content') for _ in range(3)] - ) + chapters=DocArray[Chapter]([Chapter(content='some_content') for _ in range(3)]) ) for _ in range(10) ) chapters = da.traverse_flat(access_path='chapters') # list of 30 strings - If your DocumentArray is in stacked mode, and you want to access a field of + If your DocArray is in stacked mode, and you want to access a field of type AnyTensor, the stacked tensor will be returned instead of a list: EXAMPLE USAGE @@ -223,7 +221,7 @@ class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArray[Image]( + batch = DocArray[Image]( [ Image( tensor=torch.zeros(3, 224, 224), @@ -245,9 +243,9 @@ def _traverse(node: Any, access_path: str): if access_path: curr_attr, _, path_attrs = access_path.partition('__') - from docarray.array import DocumentArray + from docarray.array import DocArray - if isinstance(node, (DocumentArray, list)): + if isinstance(node, (DocArray, list)): for n in node: x = getattr(n, curr_attr) yield from AnyDocArray._traverse(x, path_attrs) @@ -259,19 +257,19 @@ def _traverse(node: Any, 
access_path: str): @staticmethod def _flatten_one_level(sequence: List[Any]) -> List[Any]: - from docarray import DocumentArray + from docarray import DocArray - if len(sequence) == 0 or not isinstance(sequence[0], (list, DocumentArray)): + if len(sequence) == 0 or not isinstance(sequence[0], (list, DocArray)): return sequence else: return [item for sublist in sequence for item in sublist] def summary(self): """ - Print a summary of this DocumentArray object and a summary of the schema of its + Print a summary of this DocArray object and a summary of the schema of its Document type. """ - DocumentArraySummary(self).summary() + DocArraySummary(self).summary() def _batch( self: T, @@ -280,13 +278,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields `DocumentArray` of size `batch_size`. + Creates a `Generator` that yields `DocArray` of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. 
- :yield: a Generator of `DocumentArray`, each in the length of `batch_size` + :yield: a Generator of `DocArray`, each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index 7d244e60c9a..2ea713493d9 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -31,19 +31,19 @@ from pydantic import BaseConfig from pydantic.fields import ModelField - from docarray.array.stacked.array_stacked import DocumentArrayStacked - from docarray.proto import DocumentArrayProto + from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.proto import DocArrayProto from docarray.typing import TorchTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor -T = TypeVar('T', bound='DocumentArray') +T = TypeVar('T', bound='DocArray') T_doc = TypeVar('T_doc', bound=BaseDoc) def _delegate_meth_to_data(meth_name: str) -> Callable: """ create a function that mimic a function call to the data attribute of the - DocumentArray + DocArray :param meth_name: name of the method :return: a method that mimic the meth_name @@ -57,23 +57,23 @@ def _delegate_meth(self, *args, **kwargs): return _delegate_meth -class DocumentArray( +class DocArray( IndexingSequenceMixin[T_doc], PushPullMixin, IOMixinArray, AnyDocArray[T_doc] ): """ - DocumentArray is a container of Documents. + DocArray is a container of Documents. - A DocumentArray is a list of Documents of any schema. However, many - DocumentArray features are only available if these Documents are + A DocArray is a list of Documents of any schema. However, many + DocArray features are only available if these Documents are homogeneous and follow the same schema. To precise this schema you can use - the `DocumentArray[MyDocument]` syntax where MyDocument is a Document class - (i.e. schema). 
This creates a DocumentArray that can only contains Documents of + the `DocArray[MyDocument]` syntax where MyDocument is a Document class + (i.e. schema). This creates a DocArray that can only contains Documents of the type 'MyDocument'. --- ```python - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray from docarray.typing import NdArray, ImageUrl from typing import Optional @@ -83,7 +83,7 @@ class Image(BaseDoc): url: ImageUrl - da = DocumentArray[Image]( + da = DocArray[Image]( Image(url='http://url.com/foo.png') for _ in range(10) ) # noqa: E510 ``` @@ -91,8 +91,8 @@ class Image(BaseDoc): --- - If your DocumentArray is homogeneous (i.e. follows the same schema), you can access - fields at the DocumentArray level (for example `da.tensor` or `da.url`). + If your DocArray is homogeneous (i.e. follows the same schema), you can access + fields at the DocArray level (for example `da.tensor` or `da.url`). You can also set fields, with `da.tensor = np.random.random([10, 100])`: print(da.url) @@ -104,7 +104,7 @@ class Image(BaseDoc): # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] - You can index into a DocumentArray like a numpy array or torch tensor: + You can index into a DocArray like a numpy array or torch tensor: da[0] # index by position @@ -112,10 +112,10 @@ class Image(BaseDoc): da[[0, 2, 3]] # index by list of indices da[True, False, True, True, ...] 
# index by boolean mask - You can delete items from a DocumentArray like a Python List + You can delete items from a DocArray like a Python List - del da[0] # remove first element from DocumentArray - del da[0:5] # remove elements for 0 to 5 from DocumentArray + del da[0] # remove first element from DocArray + del da[0:5] # remove elements for 0 to 5 from DocArray :param docs: iterable of Document @@ -135,7 +135,7 @@ def construct( docs: Sequence[T_doc], ) -> T: """ - Create a DocumentArray without validation any data. The data must come from a + Create a DocArray without validation any data. The data must come from a trusted source :param docs: a Sequence (list) of Document with the same schema :return: @@ -146,13 +146,13 @@ def construct( def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]: """ - Validate if an Iterable of Document are compatible with this DocumentArray + Validate if an Iterable of Document are compatible with this DocArray """ for doc in docs: yield self._validate_one_doc(doc) def _validate_one_doc(self, doc: T_doc) -> T_doc: - """Validate if a Document is compatible with this DocumentArray""" + """Validate if a Document is compatible with this DocArray""" if not issubclass(self.document_type, AnyDoc) and not isinstance( doc, self.document_type ): @@ -172,16 +172,16 @@ def __bytes__(self) -> bytes: def append(self, doc: T_doc): """ - Append a Document to the DocumentArray. The Document must be from the same class - as the document_type of this DocumentArray otherwise it will fail. + Append a Document to the DocArray. The Document must be from the same class + as the document_type of this DocArray otherwise it will fail. :param doc: A Document """ self._data.append(self._validate_one_doc(doc)) def extend(self, docs: Iterable[T_doc]): """ - Extend a DocumentArray with an Iterable of Document. 
The Documents must be from - the same class as the document_type of this DocumentArray otherwise it will + Extend a DocArray with an Iterable of Document. The Documents must be from + the same class as the document_type of this DocArray otherwise it will fail. :param docs: Iterable of Documents """ @@ -189,8 +189,8 @@ def extend(self, docs: Iterable[T_doc]): def insert(self, i: int, doc: T_doc): """ - Insert a Document to the DocumentArray. The Document must be from the same - class as the document_type of this DocumentArray otherwise it will fail. + Insert a Document to the DocArray. The Document must be from the same + class as the document_type of this DocArray otherwise it will fail. :param i: index to insert :param doc: A Document """ @@ -221,7 +221,7 @@ def _get_data_column( # calling __class_getitem__ ourselves is a hack otherwise mypy complain # most likely a bug in mypy though # bug reported here https://github.com/python/mypy/issues/14111 - return DocumentArray.__class_getitem__(field_type)( + return DocArray.__class_getitem__(field_type)( (getattr(doc, field) for doc in self), ) else: @@ -232,10 +232,10 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocumentArray using the passed values + """Set all Documents in this DocArray using the passed values :param field: name of the fields to set - :values: the values to set at the DocumentArray level + :values: the values to set at the DocArray level """ ... @@ -245,17 +245,17 @@ def _set_data_column( def stack( self, tensor_type: Type['AbstractTensor'] = NdArray, - ) -> 'DocumentArrayStacked': + ) -> 'DocArrayStacked': """ - Convert the DocumentArray into a DocumentArrayStacked. `Self` cannot be used + Convert the DocArray into a DocArrayStacked. `Self` cannot be used afterwards :param tensor_type: Tensor Class used to wrap the stacked tensors. 
This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor - :return: A DocumentArrayStacked of the same document type as self + :return: A DocArrayStacked of the same document type as self """ - from docarray.array.stacked.array_stacked import DocumentArrayStacked + from docarray.array.stacked.array_stacked import DocArrayStacked - return DocumentArrayStacked.__class_getitem__(self.document_type)( + return DocArrayStacked.__class_getitem__(self.document_type)( self, tensor_type=tensor_type ) @@ -266,9 +266,9 @@ def validate( field: 'ModelField', config: 'BaseConfig', ): - from docarray.array.stacked.array_stacked import DocumentArrayStacked + from docarray.array.stacked.array_stacked import DocArrayStacked - if isinstance(value, (cls, DocumentArrayStacked)): + if isinstance(value, (cls, DocArrayStacked)): return value elif isinstance(value, Iterable): return cls(value) @@ -276,7 +276,7 @@ def validate( raise TypeError(f'Expecting an Iterable of {cls.document_type}') def traverse_flat( - self: 'DocumentArray', + self: 'DocArray', access_path: str, ) -> List[Any]: nodes = list(AnyDocArray._traverse(node=self, access_path=access_path)) @@ -285,9 +285,9 @@ def traverse_flat( return flattened @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocArrayProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocumentArray + :param pb_msg: The protobuf message from where to construct the DocArray """ return super().from_protobuf(pb_msg) diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index 4d6b5f5ca62..06042aab541 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -36,8 +36,8 @@ if TYPE_CHECKING: import pandas as pd - from docarray import DocumentArray - from docarray.proto import DocumentArrayProto + from docarray import DocArray + 
from docarray.proto import DocArrayProto T = TypeVar('T', bound='IOMixinArray') @@ -108,19 +108,19 @@ def __init__( ... @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocArrayProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocumentArray + :param pb_msg: The protobuf message from where to construct the DocArray """ return cls( cls.document_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs ) - def to_protobuf(self) -> 'DocumentArrayProto': - """Convert DocumentArray into a Protobuf message""" - from docarray.proto import DocumentArrayProto + def to_protobuf(self) -> 'DocArrayProto': + """Convert DocArray into a Protobuf message""" + from docarray.proto import DocArrayProto - da_proto = DocumentArrayProto() + da_proto = DocArrayProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) @@ -134,13 +134,13 @@ def from_bytes( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize bytes into a DocumentArray. + """Deserialize bytes into a DocArray. :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocumentArray + :return: the deserialized DocArray """ return cls._load_binary_all( file_ctx=nullcontext(data), @@ -270,13 +270,13 @@ def from_base64( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize base64 strings into a DocumentArray. + """Deserialize base64 strings into a DocArray. 
:param data: Base64 string to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocumentArray + :return: the deserialized DocArray """ return cls._load_binary_all( file_ctx=nullcontext(base64.b64decode(data)), @@ -312,17 +312,17 @@ def from_json( cls: Type[T], file: Union[str, bytes, bytearray], ) -> T: - """Deserialize JSON strings or bytes into a DocumentArray. + """Deserialize JSON strings or bytes into a DocArray. - :param file: JSON object from where to deserialize a DocumentArray - :return: the deserialized DocumentArray + :param file: JSON object from where to deserialize a DocArray + :return: the deserialized DocArray """ json_docs = json.loads(file) return cls([cls.document_type.parse_raw(v) for v in json_docs]) def to_json(self) -> str: """Convert the object into a JSON string. Can be loaded via :meth:`.from_json`. - :return: JSON serialization of DocumentArray + :return: JSON serialization of DocArray """ return json.dumps([doc.json() for doc in self]) @@ -332,36 +332,36 @@ def from_csv( file_path: str, encoding: str = 'utf-8', dialect: Union[str, csv.Dialect] = 'excel', - ) -> 'DocumentArray': + ) -> 'DocArray': """ - Load a DocumentArray from a csv file following the schema defined in the - :attr:`~docarray.DocumentArray.document_type` attribute. + Load a DocArray from a csv file following the schema defined in the + :attr:`~docarray.DocArray.document_type` attribute. Every row of the csv file will be mapped to one document in the array. The column names (defined in the first row) have to match the field names of the Document type. For nested fields use "__"-separated access paths, such as 'image__url'. - List-like fields (including field of type DocumentArray) are not supported. + List-like fields (including field of type DocArray) are not supported. 
- :param file_path: path to csv file to load DocumentArray from. + :param file_path: path to csv file to load DocArray from. :param encoding: encoding used to read the csv file. Defaults to 'utf-8'. :param dialect: defines separator and how to handle whitespaces etc. Can be a csv.Dialect instance or one string of: 'excel' (for comma seperated values), 'excel-tab' (for tab separated values), 'unix' (for csv file generated on UNIX systems). - :return: DocumentArray + :return: DocArray """ - from docarray import DocumentArray + from docarray import DocArray if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' + 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' ) doc_type = cls.document_type - da = DocumentArray.__class_getitem__(doc_type)() + da = DocArray.__class_getitem__(doc_type)() with open(file_path, 'r', encoding=encoding) as fp: rows = csv.DictReader(fp, dialect=dialect) @@ -376,7 +376,7 @@ def from_csv( ) if not all(valid_paths): raise ValueError( - f'Column names do not match the schema of the DocumentArray\'s ' + f'Column names do not match the schema of the DocArray\'s ' f'document type ({cls.document_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -393,7 +393,7 @@ def to_csv( self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' ) -> None: """ - Save a DocumentArray to a csv file. + Save a DocArray to a csv file. The field names will be stored in the first row. Each row corresponds to the information of one Document. 
Columns for nested fields will be named after the "__"-seperated access paths, @@ -417,17 +417,17 @@ def to_csv( writer.writerow(doc_dict) @classmethod - def from_pandas(cls, df: 'pd.DataFrame') -> 'DocumentArray': + def from_pandas(cls, df: 'pd.DataFrame') -> 'DocArray': """ - Load a DocumentArray from a `pandas.DataFrame` following the schema - defined in the :attr:`~docarray.DocumentArray.document_type` attribute. + Load a DocArray from a `pandas.DataFrame` following the schema + defined in the :attr:`~docarray.DocArray.document_type` attribute. Every row of the dataframe will be mapped to one Document in the array. The column names of the dataframe have to match the field names of the Document type. For nested fields use "__"-separated access paths as column names, such as 'image__url'. - List-like fields (including field of type DocumentArray) are not supported. + List-like fields (including field of type DocArray) are not supported. EXAMPLE USAGE: @@ -435,7 +435,7 @@ def from_pandas(cls, df: 'pd.DataFrame') -> 'DocumentArray': import pandas as pd - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray class Person(BaseDoc): @@ -447,26 +447,26 @@ class Person(BaseDoc): data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] ) - da = DocumentArray[Person].from_pandas(df) + da = DocArray[Person].from_pandas(df) assert da.name == ['Maria', 'Jake'] assert da.follower == [12345, 54321] :param df: pandas.DataFrame to extract Document's information from - :return: DocumentArray where each Document contains the information of one + :return: DocArray where each Document contains the information of one corresponding row of the `pandas.DataFrame`. """ - from docarray import DocumentArray + from docarray import DocArray if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' 
+ 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' ) doc_type = cls.document_type - da = DocumentArray.__class_getitem__(doc_type)() + da = DocArray.__class_getitem__(doc_type)() field_names = df.columns.tolist() if field_names is None or len(field_names) == 0: @@ -477,7 +477,7 @@ class Person(BaseDoc): ) if not all(valid_paths): raise ValueError( - f'Column names do not match the schema of the DocumentArray\'s ' + f'Column names do not match the schema of the DocArray\'s ' f'document type ({cls.document_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -492,7 +492,7 @@ class Person(BaseDoc): def to_pandas(self) -> 'pd.DataFrame': """ - Save a DocumentArray to a `pandas.DataFrame`. + Save a DocArray to a `pandas.DataFrame`. The field names will be stored as column names. Each row of the dataframe corresponds to the information of one Document. Columns for nested fields will be named after the "__"-seperated access paths, @@ -533,11 +533,11 @@ def _load_binary_all( compress: Optional[str], show_progress: bool, ): - """Read a `DocumentArray` object from a binary file + """Read a `DocArray` object from a binary file :param protocol: protocol to use. 
It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' :param compress: compress algorithm to use :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: a `DocumentArray` + :return: a `DocArray` """ with file_ctx as fp: if isinstance(fp, bytes): @@ -551,9 +551,9 @@ def _load_binary_all( compress = None if protocol is not None and protocol == 'protobuf-array': - from docarray.proto import DocumentArrayProto + from docarray.proto import DocArrayProto - dap = DocumentArrayProto() + dap = DocArrayProto() dap.ParseFromString(d) return cls.from_protobuf(dap) @@ -677,7 +677,7 @@ def load_binary( :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a DocumentArray object + :return: a DocArray object .. note:: If `file` is `str` it can specify `protocol` and `compress` as file extensions. @@ -724,12 +724,12 @@ def save_binary( compress: Optional[str] = None, show_progress: bool = False, ) -> None: - """Save DocumentArray into a binary file. + """Save DocArray into a binary file. - It will use the protocol to pick how to save the DocumentArray. - If used 'picke-array` and `protobuf-array` the DocumentArray will be stored + It will use the protocol to pick how to save the DocArray. + If used `pickle-array` and `protobuf-array` the DocArray will be stored and compressed at complete level using `pickle` or `protobuf`. - When using `protobuf` or `pickle` as protocol each Document in DocumentArray + When using `protobuf` or `pickle` as protocol each Document in DocArray will be stored individually and this would make it available for streaming. :param file: File or filename to which the data is saved. 
diff --git a/docarray/array/array/pushpull.py b/docarray/array/array/pushpull.py index 0bb2489d3e8..def3d144127 100644 --- a/docarray/array/array/pushpull.py +++ b/docarray/array/array/pushpull.py @@ -19,7 +19,7 @@ SUPPORTED_PUSH_PULL_PROTOCOLS = get_args(PUSH_PULL_PROTOCOL) if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray from docarray.store.abstract_doc_store import AbstractDocStore @@ -86,10 +86,10 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocumentArray object to the specified url. + """Push this DocArray object to the specified url. - :param url: url specifying the protocol and save name of the DocumentArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocumentArray if they know its name. + :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocArray if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} @@ -112,8 +112,8 @@ def push_stream( """Push a stream of documents to the specified url. :param docs: a stream of documents - :param url: url specifying the protocol and save name of the DocumentArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. 
If true, anyone can pull a DocumentArray if they know its name. + :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocArray if they know its name. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} """ @@ -129,20 +129,20 @@ def pull( url: str, show_progress: bool = False, local_cache: bool = True, - ) -> 'DocumentArray': - """Pull a :class:`DocumentArray` from the specified url. + ) -> 'DocArray': + """Pull a :class:`DocArray` from the specified url. - :param url: url specifying the protocol and save name of the DocumentArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocumentArray to local folder - :return: a :class:`DocumentArray` object + :param local_cache: store the downloaded DocArray to local folder + :return: a :class:`DocArray` object """ from docarray.base_document import AnyDoc if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' + 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' ) logging.info(f'Pulling {url}') @@ -160,9 +160,9 @@ def pull_stream( ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified url. 
- :param url: url specifying the protocol and save name of the DocumentArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocumentArray to local folder + :param local_cache: store the downloaded DocArray to local folder :return: Iterator of Documents """ from docarray.base_document import AnyDoc @@ -170,7 +170,7 @@ def pull_stream( if cls.document_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' + 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' ) logging.info(f'Pulling Document stream from {url}') diff --git a/docarray/array/stacked/array_stacked.py b/docarray/array/stacked/array_stacked.py index fad2e6678c7..ec64037224e 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/stacked/array_stacked.py @@ -19,7 +19,7 @@ from pydantic import BaseConfig, parse_obj_as from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocumentArray +from docarray.array.array.array import DocArray from docarray.array.stacked.column_storage import ColumnStorage, ColumnStorageView from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing from docarray.base_document import BaseDoc @@ -32,7 +32,7 @@ if TYPE_CHECKING: from pydantic.fields import ModelField - from docarray.proto import DocumentArrayStackedProto + from docarray.proto import DocArrayStackedProto torch_available = is_torch_available() if torch_available: @@ -49,41 +49,41 @@ TensorFlowTensor = None # type: ignore T_doc = TypeVar('T_doc', 
bound=BaseDoc) -T = TypeVar('T', bound='DocumentArrayStacked') +T = TypeVar('T', bound='DocArrayStacked') IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] -class DocumentArrayStacked(AnyDocArray[T_doc]): +class DocArrayStacked(AnyDocArray[T_doc]): """ - DocumentArrayStacked is a container of Documents appropriates to perform + DocArrayStacked is a container of Documents appropriate to perform computation that require batches of data (ex: matrix multiplication, distance calculation, deep learning forward pass) - A DocumentArrayStacked has a similar interface as - {class}`~docarray.array.DocumentArray` but with an underlying implementation that is + A DocArrayStacked has a similar interface as + {class}`~docarray.array.DocArray` but with an underlying implementation that is column based instead of row based. Each field - of the schema of the DocumentArrayStack - (the :attr:`~docarray.array.stacked.DocumentArrayStacked.document_type` which is a + of the schema of the DocArrayStacked + (the :attr:`~docarray.array.stacked.DocArrayStacked.document_type` which is a `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, stacked (torch/np/tf) tensor. If the tensor field is `AnyTensor` or a Union of tensor types, the - :attr:`~docarray.array.stacked.DocumentArrayStacked.tensor_type` will be used to determine + :attr:`~docarray.array.stacked.DocArrayStacked.tensor_type` will be used to determine the type of the stacked column. - If the field is another `BasedDocument` the column will be another DocumentArrayStacked that follows the + If the field is another `BaseDoc` the column will be another DocArrayStacked that follows the schema of the nested Document. - If the field is a `DocumentArray` or - `DocumentArrayStacked` then the column will be a list of `DocumentArrayStacked`. 
For any other type the column is a Python list. - Every `Document` inside a `DocumentArrayStacked` is a view into the data columns stored at the `DocumentArrayStacked` level. The `Document` does + Every `Document` inside a `DocArrayStacked` is a view into the data columns stored at the `DocArrayStacked` level. The `Document` does not hold any data itself. The behavior of this Document "view" is similar to the behavior of `view = tensor[i]` in numpy/PyTorch. - :param docs: a DocumentArray + :param docs: a DocArray :param tensor_type: Tensor Class used to wrap the stacked tensors. This is useful - if the BaseDoc of this DocumentArrayStacked has some undefined tensor type like + if the BaseDoc of this DocArrayStacked has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor """ @@ -97,16 +97,16 @@ def __init__( self.tensor_type = tensor_type tensor_columns: Dict[str, AbstractTensor] = dict() - doc_columns: Dict[str, 'DocumentArrayStacked'] = dict() - da_columns: Dict[str, ListAdvancedIndexing['DocumentArrayStacked']] = dict() + doc_columns: Dict[str, 'DocArrayStacked'] = dict() + da_columns: Dict[str, ListAdvancedIndexing['DocArrayStacked']] = dict() any_columns: Dict[str, ListAdvancedIndexing] = dict() if len(docs) == 0: raise ValueError(f'docs {docs}: should not be empty') docs = ( docs - if isinstance(docs, DocumentArray) - else DocumentArray.__class_getitem__(self.document_type)(docs) + if isinstance(docs, DocArray) + else DocArray.__class_getitem__(self.document_type)(docs) ) for field_name, field in self.document_type.__fields__.items(): @@ -167,7 +167,7 @@ def __init__( docs_list = list() for doc in docs: da = getattr(doc, field_name) - if isinstance(da, DocumentArray): + if isinstance(da, DocArray): da = da.stack(tensor_type=self.tensor_type) docs_list.append(da) da_columns[field_name] = ListAdvancedIndexing(docs_list) @@ -191,9 +191,9 @@ def __init__( @classmethod def from_columns_storage(cls: Type[T], storage: ColumnStorage) -> T: """ 
- Create a DocumentArrayStacked directly from a storage object + Create a DocArrayStacked directly from a storage object :param storage: the underlying storage. - :return: a DocumentArrayStack + :return: a DocArrayStacked """ da = cls.__new__(cls) da.tensor_type = storage.tensor_type @@ -209,7 +209,7 @@ def validate( ) -> T: if isinstance(value, cls): return value - elif isinstance(value, DocumentArray.__class_getitem__(cls.document_type)): + elif isinstance(value, DocArray.__class_getitem__(cls.document_type)): return cast(T, value.stack()) elif isinstance(value, Sequence): return cls(value) @@ -219,7 +219,7 @@ def validate( raise TypeError(f'Expecting an Iterable of {cls.document_type}') def to(self: T, device: str) -> T: - """Move all tensors of this DocumentArrayStacked to the given device + """Move all tensors of this DocArrayStacked to the given device :param device: the device to move the data to """ @@ -260,7 +260,7 @@ def __getitem__(self: T, item: Union[int, IndexIterType]) -> Union[T_doc, T]: def _get_data_column( self: T, field: str, - ) -> Union[MutableSequence, 'DocumentArrayStacked', AbstractTensor]: + ) -> Union[MutableSequence, 'DocArrayStacked', AbstractTensor]: """Return one column of the data :param field: name of the fields to extract @@ -305,12 +305,12 @@ def __setitem__(self: T, key, value): def _set_data_and_columns( self: T, index_item: Union[Tuple, Iterable, slice], - value: Union[T, DocumentArray[T_doc]], + value: Union[T, DocArray[T_doc]], ) -> None: """Delegates the setting to the data and the columns. :param index_item: the key used as index. 
Needs to be a valid index for both - DocumentArray (data) and column types (torch/tensorflow/numpy tensors) + DocArray (data) and column types (torch/tensorflow/numpy tensors) :value: the value to set at the `key` location """ if isinstance(index_item, tuple): @@ -318,25 +318,25 @@ def _set_data_and_columns( # set data and prepare columns processed_value: T - if isinstance(value, DocumentArray): + if isinstance(value, DocArray): if not issubclass(value.document_type, self.document_type): raise TypeError( f'{value} schema : {value.document_type} is not compatible with ' - f'this DocumentArrayStacked schema : {self.document_type}' + f'this DocArrayStacked schema : {self.document_type}' ) processed_value = cast( T, value.stack(tensor_type=self.tensor_type) ) # we need to copy data here - elif isinstance(value, DocumentArrayStacked): + elif isinstance(value, DocArrayStacked): if not issubclass(value.document_type, self.document_type): raise TypeError( f'{value} schema : {value.document_type} is not compatible with ' - f'this DocumentArrayStacked schema : {self.document_type}' + f'this DocArrayStacked schema : {self.document_type}' ) processed_value = value else: - raise TypeError(f'Can not set a DocumentArrayStacked with {type(value)}') + raise TypeError(f'Can not set a DocArrayStacked with {type(value)}') for field, col in self._storage.columns.items(): col[index_item] = processed_value._storage.columns[field] @@ -345,17 +345,17 @@ def _set_data_column( self: T, field: str, values: Union[ - Sequence[DocumentArray[T_doc]], + Sequence[DocArray[T_doc]], Sequence[Any], T, - DocumentArray, + DocArray, AbstractTensor, ], ) -> None: - """Set all Documents in this DocumentArray using the passed values + """Set all Documents in this DocArray using the passed values :param field: name of the fields to set - :values: the values to set at the DocumentArray level + :values: the values to set at the DocArray level """ if len(values) != len(self._storage): @@ -376,7 +376,7 @@ def 
_set_data_column( elif field in self._storage.doc_columns.keys(): values_ = parse_obj_as( - DocumentArrayStacked.__class_getitem__( + DocArrayStacked.__class_getitem__( self._storage.doc_columns[field].document_type ), values, @@ -384,7 +384,7 @@ def _set_data_column( self._storage.doc_columns[field] = values_ elif field in self._storage.da_columns.keys(): - values_ = cast(Sequence[DocumentArray[T_doc]], values) + values_ = cast(Sequence[DocArray[T_doc]], values) # TODO here we should actually check if this is correct self._storage.da_columns[field] = values_ elif field in self._storage.any_columns.keys(): @@ -392,7 +392,7 @@ def _set_data_column( values_ = cast(Sequence, values) self._storage.any_columns[field] = values_ else: - raise KeyError(f'{field} is not a valid field for this DocumentArray') + raise KeyError(f'{field} is not a valid field for this DocArray') #################### # Deleting data # @@ -422,7 +422,7 @@ def __len__(self): #################### @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayStackedProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocArrayStackedProto') -> T: """create a Document from a protobuf message""" storage = ColumnStorage( pb_msg.tensor_columns, @@ -433,23 +433,23 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayStackedProto') -> T: return cls.from_columns_storage(storage) - def to_protobuf(self) -> 'DocumentArrayStackedProto': - """Convert DocumentArray into a Protobuf message""" + def to_protobuf(self) -> 'DocArrayStackedProto': + """Convert DocArray into a Protobuf message""" from docarray.proto import ( - DocumentArrayProto, - DocumentArrayStackedProto, + DocArrayProto, + DocArrayStackedProto, ListOfAnyProto, - ListOfDocumentArrayProto, + ListOfDocArrayProto, NdArrayProto, ) - da_proto = DocumentArrayProto() + da_proto = DocArrayProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) - doc_columns_proto: Dict[str, DocumentArrayStackedProto] = dict() + doc_columns_proto: 
Dict[str, DocArrayStackedProto] = dict() tensor_columns_proto: Dict[str, NdArrayProto] = dict() - da_columns_proto: Dict[str, ListOfDocumentArrayProto] = dict() + da_columns_proto: Dict[str, ListOfDocArrayProto] = dict() any_columns_proto: Dict[str, ListOfAnyProto] = dict() for field, col_doc in self._storage.doc_columns.items(): @@ -457,7 +457,7 @@ def to_protobuf(self) -> 'DocumentArrayStackedProto': for field, col_tens in self._storage.tensor_columns.items(): tensor_columns_proto[field] = col_tens.to_protobuf() for field, col_da in self._storage.da_columns.items(): - list_proto = ListOfDocumentArrayProto() + list_proto = ListOfDocArrayProto() for da in col_da: list_proto.data.append(da.to_protobuf()) da_columns_proto[field] = list_proto @@ -467,21 +467,21 @@ def to_protobuf(self) -> 'DocumentArrayStackedProto': list_proto.data.append(_type_to_protobuf(data)) any_columns_proto[field] = list_proto - return DocumentArrayStackedProto( + return DocArrayStackedProto( doc_columns=doc_columns_proto, tensor_columns=tensor_columns_proto, da_columns=da_columns_proto, any_columns=any_columns_proto, ) - def unstack(self: T) -> DocumentArray[T_doc]: - """Convert DocumentArrayStacked into a DocumentArray. + def unstack(self: T) -> DocArray[T_doc]: + """Convert DocArrayStacked into a DocArray. 
- Note this destroys the arguments and returns a new DocumentArray + Note this destroys the arguments and returns a new DocArray """ - unstacked_doc_column: Dict[str, DocumentArray] = dict() - unstacked_da_column: Dict[str, List[DocumentArray]] = dict() + unstacked_doc_column: Dict[str, DocArray] = dict() + unstacked_da_column: Dict[str, List[DocArray]] = dict() unstacked_tensor_column: Dict[str, List[AbstractTensor]] = dict() unstacked_any_column = self._storage.any_columns @@ -515,7 +515,7 @@ def unstack(self: T) -> DocumentArray[T_doc]: del self._storage - return DocumentArray.__class_getitem__(self.document_type).construct(docs) + return DocArray.__class_getitem__(self.document_type).construct(docs) def traverse_flat( self, diff --git a/docarray/array/stacked/column_storage.py b/docarray/array/stacked/column_storage.py index 53521e27b3a..80129cfcdfd 100644 --- a/docarray/array/stacked/column_storage.py +++ b/docarray/array/stacked/column_storage.py @@ -15,7 +15,7 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array.stacked.array_stacked import DocumentArrayStacked + from docarray.array.stacked.array_stacked import DocArrayStacked IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] @@ -26,11 +26,11 @@ class ColumnStorage: """ ColumnStorage is a container to store the columns of the - :class:`~docarray.array.stacked.DocumentArrayStacked`. + :class:`~docarray.array.stacked.DocArrayStacked`. 
:param tensor_columns: a Dict of AbstractTensor - :param doc_columns: a Dict of :class:`~docarray.array.stacked.DocumentArrayStacked` - :param da_columns: a Dict of List of :class:`~docarray.array.stacked.DocumentArrayStacked` + :param doc_columns: a Dict of :class:`~docarray.array.stacked.DocArrayStacked` + :param da_columns: a Dict of List of :class:`~docarray.array.stacked.DocArrayStacked` :param any_columns: a Dict of List :param tensor_type: Class used to wrap the stacked tensors """ @@ -38,8 +38,8 @@ class ColumnStorage: def __init__( self, tensor_columns: Dict[str, AbstractTensor], - doc_columns: Dict[str, 'DocumentArrayStacked'], - da_columns: Dict[str, ListAdvancedIndexing['DocumentArrayStacked']], + doc_columns: Dict[str, 'DocArrayStacked'], + da_columns: Dict[str, ListAdvancedIndexing['DocArrayStacked']], any_columns: Dict[str, ListAdvancedIndexing], tensor_type: Type[AbstractTensor] = NdArray, ): diff --git a/docarray/base_document/mixins/io.py b/docarray/base_document/mixins/io.py index 190c9b1a99d..a69b95ef3f1 100644 --- a/docarray/base_document/mixins/io.py +++ b/docarray/base_document/mixins/io.py @@ -258,7 +258,7 @@ def _get_content_from_node_proto( elif content_key in ['document', 'document_array']: if field_name is None: raise ValueError( - 'field_name cannot be None when trying to deseriliaze a Document or a DocumentArray' + 'field_name cannot be None when trying to deserialize a Document or a DocArray' ) return_field = cls._get_field_type(field_name).from_protobuf( getattr(value, content_key) diff --git a/docarray/base_document/mixins/update.py b/docarray/base_document/mixins/update.py index 0f00ab8ee32..1fe37015c90 100644 --- a/docarray/base_document/mixins/update.py +++ b/docarray/base_document/mixins/update.py @@ -27,12 +27,12 @@ def update(self, other: T): - setting data properties of the second Document to the first Document if they are not None - Concatenating lists and updating sets - - Updating recursively Documents and DocumentArrays + 
- Updating recursively Documents and DocArrays - Updating Dictionaries of the left with the right It behaves as an update operation for Dictionaries, except that since it is applied to a static schema type, the presence of the field is - given by the field not having a None value and that DocumentArrays, + given by the field not having a None value and that DocArrays, lists and sets are concatenated. It is worth mentioning that Tuples are not merged together since they are meant to be inmutable, so they behave as regular types and the value of `self` is updated @@ -74,7 +74,7 @@ class MyDocument(BaseDoc): ) from collections import namedtuple - from docarray import DocumentArray + from docarray import DocArray from docarray.utils.reduce import reduce # Declaring namedtuple() @@ -105,7 +105,7 @@ def _group_fields(doc: 'UpdateMixin') -> _FieldGroups: field_type = doc._get_field_type(field_name) if isinstance(field_type, type) and issubclass( - field_type, DocumentArray + field_type, DocArray ): nested_docarray_fields.append(field_name) else: diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 6d0d7f5ef68..3b5cb0f87a9 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -2,7 +2,7 @@ from torch.utils.data import Dataset -from docarray import BaseDoc, DocumentArray, DocumentArrayStacked +from docarray import BaseDoc, DocArray, DocArrayStacked from docarray.typing import TorchTensor from docarray.utils._typing import change_cls_name @@ -14,7 +14,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): A dataset that can be used inside a PyTorch DataLoader. In other words, it implements the PyTorch Dataset interface. 
- :param da: the DocumentArray to be used as the dataset + :param da: the DocArray to be used as the dataset :param preprocessing: a dictionary of field names and preprocessing functions The preprocessing dictionary passed to the constructor consists of keys that are @@ -24,7 +24,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): EXAMPLE USAGE .. code-block:: python from torch.utils.data import DataLoader - from docarray import DocumentArray + from docarray import DocArray from docarray.data import MultiModalDataset from docarray.documents import Text @@ -33,7 +33,7 @@ def prepend_number(text: str): return f"Number {text}" - da = DocumentArray[Text](Text(text=str(i)) for i in range(16)) + da = DocArray[Text](Text(text=str(i)) for i in range(16)) ds = MultiModalDataset[Text](da, preprocessing={'text': prepend_number}) loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[Text].collate_fn) for batch in loader: @@ -51,7 +51,7 @@ def prepend_number(text: str): .. code-block:: python import torch from torch.utils.data import DataLoader - from docarray import DocumentArray, BaseDoc + from docarray import DocArray, BaseDoc from docarray.data import MultiModalDataset from docarray.documents import Text @@ -78,7 +78,7 @@ def add_nonsense(student: Student): ) - da = DocumentArray[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) + da = DocArray[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) ds = MultiModalDataset[Student]( da, preprocessing={ @@ -96,7 +96,7 @@ def add_nonsense(student: Student): __typed_ds__: Dict[Type[BaseDoc], Type['MultiModalDataset']] = {} def __init__( - self, da: 'DocumentArray[T_doc]', preprocessing: Dict[str, Callable] + self, da: 'DocArray[T_doc]', preprocessing: Dict[str, Callable] ) -> None: self.da = da self._preprocessing = preprocessing @@ -123,12 +123,12 @@ def __getitem__(self, item: int): def collate_fn(cls, batch: List[T_doc]): doc_type = cls.document_type if doc_type: - batch_da = 
DocumentArrayStacked[doc_type]( # type: ignore + batch_da = DocArrayStacked[doc_type]( # type: ignore batch, tensor_type=TorchTensor, ) else: - batch_da = DocumentArrayStacked(batch, tensor_type=TorchTensor) + batch_da = DocArrayStacked(batch, tensor_type=TorchTensor) return batch_da @classmethod diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 7ed5e4ca503..401ee570a95 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -3,17 +3,17 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array import DocumentArrayStacked + from docarray.array import DocArrayStacked from docarray.array.abstract_array import AnyDocArray -class DocumentArraySummary: +class DocArraySummary: def __init__(self, da: 'AnyDocArray'): self.da = da def summary(self) -> None: """ - Print a summary of this DocumentArray object and a summary of the schema of its + Print a summary of this DocArray object and a summary of the schema of its Document type. 
""" from rich import box @@ -21,14 +21,14 @@ def summary(self) -> None: from rich.panel import Panel from rich.table import Table - from docarray.array import DocumentArrayStacked + from docarray.array import DocArrayStacked table = Table(box=box.SIMPLE, highlight=True) table.show_header = False table.add_row('Type', self.da.__class__.__name__) table.add_row('Length', str(len(self.da)), end_section=True) - if isinstance(self.da, DocumentArrayStacked): + if isinstance(self.da, DocArrayStacked): table.add_row('Stacked columns:') stacked_fields = self._get_stacked_fields(da=self.da) for field_name in stacked_fields: @@ -50,14 +50,14 @@ def summary(self) -> None: table.add_row(f' • {field_name}:', col_2) - Console().print(Panel(table, title='DocumentArray Summary', expand=False)) + Console().print(Panel(table, title='DocArray Summary', expand=False)) self.da.document_type.schema_summary() @staticmethod - def _get_stacked_fields(da: 'DocumentArrayStacked') -> List[str]: # TODO this might + def _get_stacked_fields(da: 'DocArrayStacked') -> List[str]: # TODO this might # broken """ - Return a list of the field names of a DocumentArrayStacked instance that are + Return a list of the field names of a DocArrayStacked instance that are stacked, i.e. all the fields that are of type AbstractTensor. Nested field paths are separated by dot, such as: 'attr.nested_attr'. 
""" @@ -68,7 +68,7 @@ def _get_stacked_fields(da: 'DocumentArrayStacked') -> List[str]: # TODO this m fields.extend( [ f'{field_name}.{x}' - for x in DocumentArraySummary._get_stacked_fields(da=value_doc) + for x in DocArraySummary._get_stacked_fields(da=value_doc) ] ) diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index 2606ddcd4b1..f77dddd1e71 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -55,7 +55,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: from rich.tree import Tree - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray root = cls.__name__ if doc_name is None else f'{doc_name}: {cls.__name__}' tree = Tree(root, highlight=True) @@ -76,7 +76,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: for arg in field_type.__args__: if issubclass(arg, BaseDoc): sub_tree.add(DocumentSummary._get_schema(cls=arg)) - elif issubclass(arg, DocumentArray): + elif issubclass(arg, DocArray): sub_tree.add( DocumentSummary._get_schema(cls=arg.document_type) ) @@ -87,7 +87,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: DocumentSummary._get_schema(cls=field_type, doc_name=field_name) ) - elif issubclass(field_type, DocumentArray): + elif issubclass(field_type, DocArray): sub_tree = Tree(node_name, highlight=True) sub_tree.add( DocumentSummary._get_schema(cls=field_type.document_type) @@ -112,7 +112,7 @@ def __rich_console__( from rich import box, text from rich.table import Table - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray table = Table( 'Attribute', @@ -125,7 +125,7 @@ def __rich_console__( for field_name, value in self.doc.__dict__.items(): col_1 = f'{field_name}: {value.__class__.__name__}' if ( - isinstance(value, (ID, DocumentArray, BaseDoc)) + isinstance(value, (ID, DocArray, BaseDoc)) or 
field_name.startswith('_') or value is None ): @@ -177,7 +177,7 @@ def _plot_recursion( :return: Tree with all children. """ - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray tree = Tree(node) if tree is None else tree.add(node) # type: ignore @@ -185,7 +185,7 @@ def _plot_recursion( nested_attrs = [ k for k, v in node.doc.__dict__.items() - if isinstance(v, (DocumentArray, BaseDoc)) + if isinstance(v, (DocArray, BaseDoc)) ] for attr in nested_attrs: value = getattr(node.doc, attr) diff --git a/docarray/documents/legacy/legacy_document.py b/docarray/documents/legacy/legacy_document.py index fbb59369153..e550a97c800 100644 --- a/docarray/documents/legacy/legacy_document.py +++ b/docarray/documents/legacy/legacy_document.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.typing import AnyEmbedding, AnyTensor @@ -16,7 +16,7 @@ class LegacyDocument(BaseDoc): of the data is similar. .. 
code-block:: python - from docarray import DocumentArray + from docarray import DocArray from docarray.documents.legacy import LegacyDocument import numpy as np @@ -27,15 +27,15 @@ class LegacyDocument(BaseDoc): doc.tags['price'] = 10 - doc.chunks = DocumentArray[Document]([Document() for _ in range(10)]) + doc.chunks = DocArray[Document]([Document() for _ in range(10)]) - doc.chunks = DocumentArray[Document]([Document() for _ in range(10)]) + doc.chunks = DocArray[Document]([Document() for _ in range(10)]) """ tensor: Optional[AnyTensor] - chunks: Optional[DocumentArray[LegacyDocument]] - matches: Optional[DocumentArray[LegacyDocument]] + chunks: Optional[DocArray[LegacyDocument]] + matches: Optional[DocArray[LegacyDocument]] blob: Optional[bytes] text: Optional[str] url: Optional[str] diff --git a/docarray/helper.py b/docarray/helper.py index d921f8b3f7d..3cf74379e8d 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -135,7 +135,7 @@ def _get_field_type_by_access_path( :param access_path: "__"-separated access path :return: field type of accessed attribute. If access path is invalid, return None. """ - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray field, _, remaining = access_path.partition('__') field_valid = field in doc_type.__fields__.keys() @@ -145,7 +145,7 @@ def _get_field_type_by_access_path( return doc_type._get_field_type(field) else: d = doc_type._get_field_type(field) - if issubclass(d, DocumentArray): + if issubclass(d, DocArray): return _get_field_type_by_access_path(d.document_type, remaining) elif issubclass(d, BaseDoc): return _get_field_type_by_access_path(d, remaining) @@ -180,7 +180,7 @@ def get_paths( .. 
code-block:: python from typing import Optional - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray from docarray.helper import get_paths from docarray.typing import TextUrl, ImageUrl @@ -191,9 +191,7 @@ class Banner(BaseDoc): # you can call it in the constructor - da = DocumentArray[Banner]( - [Banner(text_url=url) for url in get_paths(patterns='*.txt')] - ) + da = DocArray[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) # and call it after construction to set the urls da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 2a7d99d2ff3..3eac3d7f869 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -23,7 +23,7 @@ from pydantic.error_wrappers import ValidationError from typing_inspect import get_args, is_optional_type, is_union_type -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.array.abstract_array import AnyDocArray from docarray.typing import AnyTensor from docarray.utils._typing import unwrap_optional_type @@ -45,12 +45,12 @@ class FindResultBatched(NamedTuple): - documents: List[DocumentArray] + documents: List[DocArray] scores: np.ndarray class _FindResultBatched(NamedTuple): - documents: Union[List[DocumentArray], List[List[Dict[str, Any]]]] + documents: Union[List[DocArray], List[List[Dict[str, Any]]]] scores: np.ndarray @@ -251,12 +251,12 @@ def _filter( self, filter_query: Any, limit: int, - ) -> Union[DocumentArray, List[Dict]]: + ) -> Union[DocArray, List[Dict]]: """Find documents in the index based on a filter query :param filter_query: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocumentArray containing the documents that match the filter query + :return: a DocArray containing the documents that match the filter query """ ... 
@@ -265,13 +265,13 @@ def _filter_batched( self, filter_queries: Any, limit: int, - ) -> Union[List[DocumentArray], List[List[Dict]]]: + ) -> Union[List[DocArray], List[List[Dict]]]: """Find documents in the index based on multiple filter queries. Each query is considered individually, and results are returned per query. :param filter_queries: the DB specific filter queries to execute :param limit: maximum number of documents to return per query - :return: List of DocumentArrays containing the documents + :return: List of DocArrays containing the documents that match the filter queries """ ... @@ -319,7 +319,7 @@ def _text_search_batched( def __getitem__( self, key: Union[str, Sequence[str]] - ) -> Union[TSchema, DocumentArray[TSchema]]: + ) -> Union[TSchema, DocArray[TSchema]]: """Get one or multiple Documents into the index, by `id`. If no document is found, a KeyError is raised. @@ -338,12 +338,12 @@ def __getitem__( raise KeyError(f'No document with id {key} found') # cast output - if isinstance(doc_sequence, DocumentArray): - out_da: DocumentArray[TSchema] = doc_sequence + if isinstance(doc_sequence, DocArray): + out_da: DocArray[TSchema] = doc_sequence elif isinstance(doc_sequence[0], Dict): out_da = self._dict_list_to_docarray(doc_sequence) # type: ignore else: - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) out_da = da_cls(doc_sequence) return out_da[0] if return_singleton else out_da @@ -382,9 +382,9 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): :param docs: Documents to index. 
""" - if not isinstance(docs, (BaseDoc, DocumentArray)): + if not isinstance(docs, (BaseDoc, DocArray)): self._logger.warning( - 'Passing a sequence of Documents that is not a DocumentArray comes at ' + 'Passing a sequence of Documents that is not a DocArray comes at ' 'a performance penalty, since compatibility with the schema of Index ' 'needs to be checked for every Document individually.' ) @@ -428,7 +428,7 @@ def find( def find_batched( self, - queries: Union[AnyTensor, DocumentArray], + queries: Union[AnyTensor, DocArray], search_field: str = 'embedding', limit: int = 10, **kwargs, @@ -437,7 +437,7 @@ def find_batched( :param queries: query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, - or a DocumentArray. + or a DocArray. If a tensor-like is passed, it should have shape (batch_size, vector_dim) :param search_field: name of the field to search on. Documents in the index are retrieved based on this similarity @@ -468,12 +468,12 @@ def filter( filter_query: Any, limit: int = 10, **kwargs, - ) -> DocumentArray: + ) -> DocArray: """Find documents in the index based on a filter query :param filter_query: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocumentArray containing the documents that match the filter query + :return: a DocArray containing the documents that match the filter query """ self._logger.debug(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) @@ -488,12 +488,12 @@ def filter_batched( filter_queries: Any, limit: int = 10, **kwargs, - ) -> List[DocumentArray]: + ) -> List[DocArray]: """Find documents in the index based on multiple filter queries. 
:param filter_queries: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocumentArray containing the documents that match the filter query + :return: a DocArray containing the documents that match the filter query """ self._logger.debug( f'Executing `filter_batched` for the queries {filter_queries}' @@ -574,7 +574,7 @@ def text_search_batched( def _get_values_by_column(docs: Sequence[BaseDoc], col_name: str) -> List[Any]: """Get the value of a column of a document. - :param docs: The DocumentArray to get the values from + :param docs: The DocArray to get the values from :param col_name: The name of the column, e.g. 'text' or 'image__tensor' :return: The value of the column of `doc` """ @@ -597,7 +597,7 @@ def _transpose_col_value_dict( """'Transpose' the output of `_get_col_value_dict()`: Yield rows of columns, where each row represent one Document. Since a generator is returned, this process comes at negligible cost. - :param docs: The DocumentArray to get the values from + :param docs: The DocArray to get the values from :return: The `docs` flattened out as rows. Each row is a dictionary mapping from column name to value """ return (dict(zip(col_value_dict, row)) for row in zip(*col_value_dict.values())) @@ -716,7 +716,7 @@ def _create_column_infos(self, schema: Type[BaseDoc]) -> Dict[str, _ColumnInfo]: ) elif issubclass(type_, AnyDocArray): raise ValueError( - 'Indexing field of DocumentArray type (=subindex)' + 'Indexing field of DocArray type (=subindex)' 'is not yet supported.' ) else: @@ -755,22 +755,22 @@ def _create_single_column(self, field: 'ModelField', type_: Type) -> _ColumnInfo def _validate_docs( self, docs: Union[BaseDoc, Sequence[BaseDoc]] - ) -> DocumentArray[BaseDoc]: + ) -> DocArray[BaseDoc]: """Validates Document against the schema of the Document Index. For validation to pass, the schema of `docs` and the schema of the Document Index need to evaluate to the same flattened columns. 
If Validation fails, a ValueError is raised. - :param docs: Document to evaluate. If this is a DocumentArray, validation is + :param docs: Document to evaluate. If this is a DocArray, validation is performed using its `doc_type` (parametrization), without having to check ever Document in `docs`. If this check fails, or if `docs` is not a - DocumentArray, evaluation is performed for every Document in `docs`. - :return: A DocumentArray containing the Documents in `docs` + DocArray, evaluation is performed for every Document in `docs`. + :return: A DocArray containing the Documents in `docs` """ if isinstance(docs, BaseDoc): docs = [docs] - if isinstance(docs, DocumentArray): - # validation shortcut for DocumentArray; only look at the schema + if isinstance(docs, DocArray): + # validation shortcut for DocArray; only look at the schema reference_schema_flat = self._flatten_schema( cast(Type[BaseDoc], self._schema) ) @@ -801,7 +801,7 @@ def _validate_docs( ' and that the types of your data match the types of the Document Index schema.' 
) - return DocumentArray[BaseDoc].construct(out_docs) + return DocArray[BaseDoc].construct(out_docs) def _to_numpy(self, val: Any, allow_passthrough=False) -> Any: """ @@ -854,11 +854,9 @@ def _convert_dict_to_doc( schema_cls = cast(Type[BaseDoc], schema) return schema_cls(**doc_dict) - def _dict_list_to_docarray( - self, dict_list: Sequence[Dict[str, Any]] - ) -> DocumentArray: - """Convert a list of docs in dict type to a DocumentArray of the schema type.""" + def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocArray: + """Convert a list of docs in dict type to a DocArray of the schema type.""" doc_list = [self._convert_dict_to_doc(doc_dict, self._schema) for doc_dict in dict_list] # type: ignore - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(doc_list) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 8b5cf27804b..38b827539ab 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -20,7 +20,7 @@ import hnswlib import numpy as np -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.index.abstract import ( BaseDocIndex, _ColumnInfo, @@ -204,9 +204,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: f'args and kwargs not supported for `execute_query` on {type(self)}' ) - ann_docs = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema))( - [] - ) + ann_docs = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) filter_conditions = [] doc_to_score: Dict[BaseDoc, Any] = {} for op, op_kwargs in query: @@ -220,7 +218,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: - da_cls = 
DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) docs_filtered = da_cls(filter_docs(docs_filtered, cond)) self._logger.debug(f'{len(docs_filtered)} results found') @@ -260,7 +258,7 @@ def _filter( self, filter_query: Any, limit: int, - ) -> DocumentArray: + ) -> DocArray: raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' @@ -271,7 +269,7 @@ def _filter_batched( self, filter_queries: Any, limit: int, - ) -> List[DocumentArray]: + ) -> List[DocArray]: raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' @@ -379,22 +377,22 @@ def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]): 'SELECT data FROM docs WHERE doc_id IN %s' % sql_id_list, ) rows = self._sqlite_cursor.fetchall() - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls([self._doc_from_bytes(row[0]) for row in rows]) - def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocumentArray[TSchema]: + def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocArray[TSchema]: hashed_ids = tuple(self._to_hashed_id(id_) for id_ in doc_ids) docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) - da_cls = DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) - def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocumentArray: + def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocArray: docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) def _in_position(doc): return hashed_ids.index(self._to_hashed_id(doc.id)) - da_cls = 
DocumentArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(sorted(docs_unsorted, key=_in_position)) def _delete_docs_from_sqlite(self, doc_ids: Sequence[Union[str, int]]): diff --git a/docarray/proto/__init__.py b/docarray/proto/__init__.py index 5bc8c078c51..1f53c5e2ca8 100644 --- a/docarray/proto/__init__.py +++ b/docarray/proto/__init__.py @@ -3,34 +3,34 @@ if __pb__version__.startswith('4'): from docarray.proto.pb.docarray_pb2 import ( DictOfAnyProto, - DocumentArrayProto, - DocumentArrayStackedProto, + DocArrayProto, + DocArrayStackedProto, DocumentProto, ListOfAnyProto, - ListOfDocumentArrayProto, + ListOfDocArrayProto, NdArrayProto, NodeProto, ) else: from docarray.proto.pb2.docarray_pb2 import ( DictOfAnyProto, - DocumentArrayProto, - DocumentArrayStackedProto, + DocArrayProto, + DocArrayStackedProto, DocumentProto, ListOfAnyProto, - ListOfDocumentArrayProto, + ListOfDocArrayProto, NdArrayProto, NodeProto, ) __all__ = [ - 'DocumentArrayProto', + 'DocArrayProto', 'DocumentProto', 'NdArrayProto', 'NodeProto', - 'DocumentArrayStackedProto', - 'DocumentArrayProto', - 'ListOfDocumentArrayProto', + 'DocArrayStackedProto', + 'DocArrayProto', + 'ListOfDocArrayProto', 'ListOfAnyProto', 'DictOfAnyProto', ] diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index ae9c86a2fc1..85c6a882f0b 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -54,8 +54,8 @@ message NodeProto { NdArrayProto ndarray = 6; // a sub Document DocumentProto document = 7; - // a sub DocumentArray - DocumentArrayProto document_array = 8; + // a sub DocArray + DocArrayProto document_array = 8; //any list ListOfAnyProto list = 9; //any set @@ -91,18 +91,18 @@ message ListOfAnyProto { repeated NodeProto data = 1; } -message DocumentArrayProto { +message DocArrayProto { repeated DocumentProto docs = 1; // a list of Documents } -message 
ListOfDocumentArrayProto { - repeated DocumentArrayProto data = 1; +message ListOfDocArrayProto { + repeated DocArrayProto data = 1; } -message DocumentArrayStackedProto{ +message DocArrayStackedProto{ map tensor_columns = 1; // a dict of document columns - map doc_columns = 2; // a dict of tensor columns - map da_columns = 3; // a dict of document array columns + map doc_columns = 2; // a dict of tensor columns + map da_columns = 3; // a dict of document array columns map any_columns = 4; // a dict of any columns. Used for the rest of the data } \ No newline at end of file diff --git a/docarray/proto/pb/docarray_pb2.py b/docarray/proto/pb/docarray_pb2.py index b66c36a7e1e..2f71d2fa67c 100644 --- a/docarray/proto/pb/docarray_pb2.py +++ b/docarray/proto/pb/docarray_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc6\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 
\x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"F\n\x18ListOfDocumentArrayProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocumentArrayProto\"\x90\x05\n\x19\x44ocumentArrayStackedProto\x12N\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x36.docarray.DocumentArrayStackedProto.TensorColumnsEntry\x12H\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32\x33.docarray.DocumentArrayStackedProto.DocColumnsEntry\x12\x46\n\nda_columns\x18\x03 \x03(\x0b\x32\x32.docarray.DocumentArrayStackedProto.DaColumnsEntry\x12H\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32\x33.docarray.DocumentArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aV\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.docarray.DocumentArrayStackedProto:\x02\x38\x01\x1aT\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 
\x01(\t\x12\x31\n\x05value\x18\x02 \x01(\x0b\x32\".docarray.ListOfDocumentArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x31\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x17.docarray.DocArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 
\x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"6\n\rDocArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"<\n\x13ListOfDocArrayProto\x12%\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x17.docarray.DocArrayProto\"\xed\x04\n\x14\x44ocArrayStackedProto\x12I\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x31.docarray.DocArrayStackedProto.TensorColumnsEntry\x12\x43\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32..docarray.DocArrayStackedProto.DocColumnsEntry\x12\x41\n\nda_columns\x18\x03 \x03(\x0b\x32-.docarray.DocArrayStackedProto.DaColumnsEntry\x12\x43\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32..docarray.DocArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aQ\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 \x01(\x0b\x32\x1e.docarray.DocArrayStackedProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docarray_pb2', globals()) @@ -25,14 +25,14 @@ _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' _DICTOFANYPROTO_DATAENTRY._options = None _DICTOFANYPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None - 
_DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start=58 _DENSENDARRAYPROTO._serialized_end=123 _NDARRAYPROTO._serialized_start=125 @@ -42,29 +42,29 @@ _GENERICDICTVALUE._serialized_start=322 _GENERICDICTVALUE._serialized_end=381 _NODEPROTO._serialized_start=384 - _NODEPROTO._serialized_end=838 - _DOCUMENTPROTO._serialized_start=841 - _DOCUMENTPROTO._serialized_end=971 - _DOCUMENTPROTO_DATAENTRY._serialized_start=907 - _DOCUMENTPROTO_DATAENTRY._serialized_end=971 - _DICTOFANYPROTO._serialized_start=974 - _DICTOFANYPROTO._serialized_end=1106 - _DICTOFANYPROTO_DATAENTRY._serialized_start=907 - _DICTOFANYPROTO_DATAENTRY._serialized_end=971 - _LISTOFANYPROTO._serialized_start=1108 - _LISTOFANYPROTO._serialized_end=1159 - _DOCUMENTARRAYPROTO._serialized_start=1161 - _DOCUMENTARRAYPROTO._serialized_end=1220 - _LISTOFDOCUMENTARRAYPROTO._serialized_start=1222 - _LISTOFDOCUMENTARRAYPROTO._serialized_end=1292 - _DOCUMENTARRAYSTACKEDPROTO._serialized_start=1295 - _DOCUMENTARRAYSTACKEDPROTO._serialized_end=1951 - 
_DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start=1624 - _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end=1700 - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start=1702 - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end=1788 - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start=1790 - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end=1874 - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start=1876 - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end=1951 + _NODEPROTO._serialized_end=833 + _DOCUMENTPROTO._serialized_start=836 + _DOCUMENTPROTO._serialized_end=966 + _DOCUMENTPROTO_DATAENTRY._serialized_start=902 + _DOCUMENTPROTO_DATAENTRY._serialized_end=966 + _DICTOFANYPROTO._serialized_start=969 + _DICTOFANYPROTO._serialized_end=1101 + _DICTOFANYPROTO_DATAENTRY._serialized_start=902 + _DICTOFANYPROTO_DATAENTRY._serialized_end=966 + _LISTOFANYPROTO._serialized_start=1103 + _LISTOFANYPROTO._serialized_end=1154 + _DOCARRAYPROTO._serialized_start=1156 + _DOCARRAYPROTO._serialized_end=1210 + _LISTOFDOCARRAYPROTO._serialized_start=1212 + _LISTOFDOCARRAYPROTO._serialized_end=1272 + _DOCARRAYSTACKEDPROTO._serialized_start=1275 + _DOCARRAYSTACKEDPROTO._serialized_end=1896 + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start=1579 + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end=1655 + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start=1657 + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end=1738 + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start=1740 + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end=1819 + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start=1821 + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end=1896 # @@protoc_insertion_point(module_scope) diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index cc71cb81420..2de3bbdf678 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ 
b/docarray/proto/pb2/docarray_pb2.py @@ -16,7 +16,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc6\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 
\x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"F\n\x18ListOfDocumentArrayProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocumentArrayProto\"\x90\x05\n\x19\x44ocumentArrayStackedProto\x12N\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x36.docarray.DocumentArrayStackedProto.TensorColumnsEntry\x12H\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32\x33.docarray.DocumentArrayStackedProto.DocColumnsEntry\x12\x46\n\nda_columns\x18\x03 \x03(\x0b\x32\x32.docarray.DocumentArrayStackedProto.DaColumnsEntry\x12H\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32\x33.docarray.DocumentArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aV\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.docarray.DocumentArrayStackedProto:\x02\x38\x01\x1aT\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x31\n\x05value\x18\x02 \x01(\x0b\x32\".docarray.ListOfDocumentArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 
\x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x31\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x17.docarray.DocArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"6\n\rDocArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"<\n\x13ListOfDocArrayProto\x12%\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x17.docarray.DocArrayProto\"\xed\x04\n\x14\x44ocArrayStackedProto\x12I\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x31.docarray.DocArrayStackedProto.TensorColumnsEntry\x12\x43\n\x0b\x64oc_columns\x18\x02 
\x03(\x0b\x32..docarray.DocArrayStackedProto.DocColumnsEntry\x12\x41\n\nda_columns\x18\x03 \x03(\x0b\x32-.docarray.DocArrayStackedProto.DaColumnsEntry\x12\x43\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32..docarray.DocArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aQ\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 \x01(\x0b\x32\x1e.docarray.DocArrayStackedProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' ) @@ -30,23 +30,21 @@ _DICTOFANYPROTO = DESCRIPTOR.message_types_by_name['DictOfAnyProto'] _DICTOFANYPROTO_DATAENTRY = _DICTOFANYPROTO.nested_types_by_name['DataEntry'] _LISTOFANYPROTO = DESCRIPTOR.message_types_by_name['ListOfAnyProto'] -_DOCUMENTARRAYPROTO = DESCRIPTOR.message_types_by_name['DocumentArrayProto'] -_LISTOFDOCUMENTARRAYPROTO = DESCRIPTOR.message_types_by_name['ListOfDocumentArrayProto'] -_DOCUMENTARRAYSTACKEDPROTO = DESCRIPTOR.message_types_by_name[ - 'DocumentArrayStackedProto' +_DOCARRAYPROTO = DESCRIPTOR.message_types_by_name['DocArrayProto'] +_LISTOFDOCARRAYPROTO = DESCRIPTOR.message_types_by_name['ListOfDocArrayProto'] +_DOCARRAYSTACKEDPROTO = DESCRIPTOR.message_types_by_name['DocArrayStackedProto'] +_DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ + 'TensorColumnsEntry' +] +_DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ + 'DocColumnsEntry' +] +_DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ + 'DaColumnsEntry' +] +_DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ + 
'AnyColumnsEntry' ] -_DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY = ( - _DOCUMENTARRAYSTACKEDPROTO.nested_types_by_name['TensorColumnsEntry'] -) -_DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY = ( - _DOCUMENTARRAYSTACKEDPROTO.nested_types_by_name['DocColumnsEntry'] -) -_DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY = ( - _DOCUMENTARRAYSTACKEDPROTO.nested_types_by_name['DaColumnsEntry'] -) -_DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY = ( - _DOCUMENTARRAYSTACKEDPROTO.nested_types_by_name['AnyColumnsEntry'] -) DenseNdArrayProto = _reflection.GeneratedProtocolMessageType( 'DenseNdArrayProto', (_message.Message,), @@ -155,78 +153,78 @@ ) _sym_db.RegisterMessage(ListOfAnyProto) -DocumentArrayProto = _reflection.GeneratedProtocolMessageType( - 'DocumentArrayProto', +DocArrayProto = _reflection.GeneratedProtocolMessageType( + 'DocArrayProto', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTARRAYPROTO, + 'DESCRIPTOR': _DOCARRAYPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayProto) + # @@protoc_insertion_point(class_scope:docarray.DocArrayProto) }, ) -_sym_db.RegisterMessage(DocumentArrayProto) +_sym_db.RegisterMessage(DocArrayProto) -ListOfDocumentArrayProto = _reflection.GeneratedProtocolMessageType( - 'ListOfDocumentArrayProto', +ListOfDocArrayProto = _reflection.GeneratedProtocolMessageType( + 'ListOfDocArrayProto', (_message.Message,), { - 'DESCRIPTOR': _LISTOFDOCUMENTARRAYPROTO, + 'DESCRIPTOR': _LISTOFDOCARRAYPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.ListOfDocumentArrayProto) + # @@protoc_insertion_point(class_scope:docarray.ListOfDocArrayProto) }, ) -_sym_db.RegisterMessage(ListOfDocumentArrayProto) +_sym_db.RegisterMessage(ListOfDocArrayProto) -DocumentArrayStackedProto = _reflection.GeneratedProtocolMessageType( - 'DocumentArrayStackedProto', +DocArrayStackedProto = _reflection.GeneratedProtocolMessageType( + 'DocArrayStackedProto', (_message.Message,), { 'TensorColumnsEntry': 
_reflection.GeneratedProtocolMessageType( 'TensorColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY, + 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.TensorColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.TensorColumnsEntry) }, ), 'DocColumnsEntry': _reflection.GeneratedProtocolMessageType( 'DocColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY, + 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.DocColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.DocColumnsEntry) }, ), 'DaColumnsEntry': _reflection.GeneratedProtocolMessageType( 'DaColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY, + 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.DaColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.DaColumnsEntry) }, ), 'AnyColumnsEntry': _reflection.GeneratedProtocolMessageType( 'AnyColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY, + 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.AnyColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.AnyColumnsEntry) }, ), - 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO, + 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto) + # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto) }, ) 
-_sym_db.RegisterMessage(DocumentArrayStackedProto) -_sym_db.RegisterMessage(DocumentArrayStackedProto.TensorColumnsEntry) -_sym_db.RegisterMessage(DocumentArrayStackedProto.DocColumnsEntry) -_sym_db.RegisterMessage(DocumentArrayStackedProto.DaColumnsEntry) -_sym_db.RegisterMessage(DocumentArrayStackedProto.AnyColumnsEntry) +_sym_db.RegisterMessage(DocArrayStackedProto) +_sym_db.RegisterMessage(DocArrayStackedProto.TensorColumnsEntry) +_sym_db.RegisterMessage(DocArrayStackedProto.DocColumnsEntry) +_sym_db.RegisterMessage(DocArrayStackedProto.DaColumnsEntry) +_sym_db.RegisterMessage(DocArrayStackedProto.AnyColumnsEntry) if _descriptor._USE_C_DESCRIPTORS == False: @@ -235,14 +233,14 @@ _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' _DICTOFANYPROTO_DATAENTRY._options = None _DICTOFANYPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start = 58 _DENSENDARRAYPROTO._serialized_end = 123 
_NDARRAYPROTO._serialized_start = 125 @@ -252,29 +250,29 @@ _GENERICDICTVALUE._serialized_start = 322 _GENERICDICTVALUE._serialized_end = 381 _NODEPROTO._serialized_start = 384 - _NODEPROTO._serialized_end = 838 - _DOCUMENTPROTO._serialized_start = 841 - _DOCUMENTPROTO._serialized_end = 971 - _DOCUMENTPROTO_DATAENTRY._serialized_start = 907 - _DOCUMENTPROTO_DATAENTRY._serialized_end = 971 - _DICTOFANYPROTO._serialized_start = 974 - _DICTOFANYPROTO._serialized_end = 1106 - _DICTOFANYPROTO_DATAENTRY._serialized_start = 907 - _DICTOFANYPROTO_DATAENTRY._serialized_end = 971 - _LISTOFANYPROTO._serialized_start = 1108 - _LISTOFANYPROTO._serialized_end = 1159 - _DOCUMENTARRAYPROTO._serialized_start = 1161 - _DOCUMENTARRAYPROTO._serialized_end = 1220 - _LISTOFDOCUMENTARRAYPROTO._serialized_start = 1222 - _LISTOFDOCUMENTARRAYPROTO._serialized_end = 1292 - _DOCUMENTARRAYSTACKEDPROTO._serialized_start = 1295 - _DOCUMENTARRAYSTACKEDPROTO._serialized_end = 1951 - _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start = 1624 - _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end = 1700 - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start = 1702 - _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end = 1788 - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start = 1790 - _DOCUMENTARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end = 1874 - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start = 1876 - _DOCUMENTARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end = 1951 + _NODEPROTO._serialized_end = 833 + _DOCUMENTPROTO._serialized_start = 836 + _DOCUMENTPROTO._serialized_end = 966 + _DOCUMENTPROTO_DATAENTRY._serialized_start = 902 + _DOCUMENTPROTO_DATAENTRY._serialized_end = 966 + _DICTOFANYPROTO._serialized_start = 969 + _DICTOFANYPROTO._serialized_end = 1101 + _DICTOFANYPROTO_DATAENTRY._serialized_start = 902 + _DICTOFANYPROTO_DATAENTRY._serialized_end = 966 + _LISTOFANYPROTO._serialized_start = 1103 + _LISTOFANYPROTO._serialized_end = 
1154 + _DOCARRAYPROTO._serialized_start = 1156 + _DOCARRAYPROTO._serialized_end = 1210 + _LISTOFDOCARRAYPROTO._serialized_start = 1212 + _LISTOFDOCARRAYPROTO._serialized_end = 1272 + _DOCARRAYSTACKEDPROTO._serialized_start = 1275 + _DOCARRAYSTACKEDPROTO._serialized_end = 1896 + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start = 1579 + _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end = 1655 + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start = 1657 + _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end = 1738 + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start = 1740 + _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end = 1819 + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start = 1821 + _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end = 1896 # @@protoc_insertion_point(module_scope) diff --git a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index ff0252bb397..c5c152499c2 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -4,48 +4,48 @@ from typing_extensions import TYPE_CHECKING if TYPE_CHECKING: - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray class AbstractDocStore(ABC): @staticmethod @abstractmethod def list(namespace: str, show_table: bool) -> List[str]: - """List all DocumentArrays in the specified backend at the namespace. + """List all DocArrays in the specified backend at the namespace. :param namespace: The namespace to list :param show_table: If true, a table is printed to the console - :return: A list of DocumentArray names + :return: A list of DocArray names """ ... @staticmethod @abstractmethod def delete(name: str, missing_ok: bool) -> bool: - """Delete the DocumentArray object at the specified name + """Delete the DocArray object at the specified name - :param name: The name of the DocumentArray to delete - :param missing_ok: If true, no error will be raised if the DocumentArray does not exist. 
- :return: True if the DocumentArray was deleted, False if it did not exist. + :param name: The name of the DocArray to delete + :param missing_ok: If true, no error will be raised if the DocArray does not exist. + :return: True if the DocArray was deleted, False if it did not exist. """ ... @staticmethod @abstractmethod def push( - da: 'DocumentArray', + da: 'DocArray', name: str, public: bool, show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocumentArray to the specified name. + """Push this DocArray to the specified name. - :param da: The DocumentArray to push + :param da: The DocArray to push :param name: The name to push to - :param public: Whether the DocumentArray should be publicly accessible + :param public: Whether the DocArray should be publicly accessible :param show_progress: If true, a progress bar will be displayed. - :param branding: Branding information to be stored with the DocumentArray + :param branding: Branding information to be stored with the DocArray """ ... @@ -62,43 +62,43 @@ def push_stream( :param docs: a stream of documents :param url: The name to push to - :param public: Whether the DocumentArray should be publicly accessible + :param public: Whether the DocArray should be publicly accessible :param show_progress: If true, a progress bar will be displayed. - :param branding: Branding information to be stored with the DocumentArray + :param branding: Branding information to be stored with the DocArray """ ... @staticmethod @abstractmethod def pull( - da_cls: Type['DocumentArray'], + da_cls: Type['DocArray'], name: str, show_progress: bool, local_cache: bool, - ) -> 'DocumentArray': - """Pull a DocumentArray from the specified name. + ) -> 'DocArray': + """Pull a DocArray from the specified name. - :param da_cls: The DocumentArray class to instantiate + :param da_cls: The DocArray class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. 
- :param local_cache: If true, the DocumentArray will be cached locally - :return: A DocumentArray + :param local_cache: If true, the DocArray will be cached locally + :return: A DocArray """ ... @staticmethod @abstractmethod def pull_stream( - da_cls: Type['DocumentArray'], + da_cls: Type['DocArray'], name: str, show_progress: bool, local_cache: bool, ) -> Iterator['BaseDoc']: """Pull a stream of documents from the specified name. - :param da_cls: The DocumentArray class to instantiate + :param da_cls: The DocArray class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. - :param local_cache: If true, the DocumentArray will be cached locally + :param local_cache: If true, the DocArray will be cached locally :return: An iterator of documents""" ... diff --git a/docarray/store/file.py b/docarray/store/file.py index fe7f3adb003..aeb355e2e17 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -10,7 +10,7 @@ from docarray.utils.cache import get_cache_path if TYPE_CHECKING: - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray SelfFileDocStore = TypeVar('SelfFileDocStore', bound='FileDocStore') @@ -32,11 +32,11 @@ def _abs_filepath(name: str) -> Path: def list( cls: Type[SelfFileDocStore], namespace: str, show_table: bool ) -> List[str]: - """List all DocumentArrays in a directory. + """List all DocArrays in a directory. :param namespace: The directory to list. :param show_table: If True, print a table of the files in the directory. - :return: A list of the names of the DocumentArrays in the directory. + :return: A list of the names of the DocArrays in the directory. 
""" namespace_dir = cls._abs_filepath(namespace) if not namespace_dir.exists(): @@ -51,7 +51,7 @@ def list( from rich.table import Table table = Table( - title=f'You have {len(da_files)} DocumentArrays in file://{namespace_dir}', + title=f'You have {len(da_files)} DocArrays in file://{namespace_dir}', box=box.SIMPLE, highlight=True, ) @@ -74,9 +74,9 @@ def list( def delete( cls: Type[SelfFileDocStore], name: str, missing_ok: bool = False ) -> bool: - """Delete a DocumentArray from the local filesystem. + """Delete a DocArray from the local filesystem. - :param name: The name of the DocumentArray to delete. + :param name: The name of the DocArray to delete. :param missing_ok: If True, do not raise an exception if the file does not exist. Defaults to False. :return: True if the file was deleted, False if it did not exist. """ @@ -92,13 +92,13 @@ def delete( @classmethod def push( cls: Type[SelfFileDocStore], - da: 'DocumentArray', + da: 'DocArray', name: str, public: bool, show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocumentArray object to the specified file path. + """Push this DocArray object to the specified file path. :param name: The file path to push to. :param public: Not used by the ``file`` protocol. @@ -145,17 +145,17 @@ def push_stream( @classmethod def pull( cls: Type[SelfFileDocStore], - da_cls: Type['DocumentArray'], + da_cls: Type['DocArray'], name: str, show_progress: bool, local_cache: bool, - ) -> 'DocumentArray': - """Pull a :class:`DocumentArray` from the specified url. + ) -> 'DocArray': + """Pull a :class:`DocArray` from the specified url. :param name: The file path to pull from. :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocumentArray to local folder - :return: a :class:`DocumentArray` object + :param local_cache: store the downloaded DocArray to local folder + :return: a :class:`DocArray` object """ return da_cls( @@ -167,7 +167,7 @@ def pull( @classmethod def pull_stream( cls: Type[SelfFileDocStore], - da_cls: Type['DocumentArray'], + da_cls: Type['DocArray'], name: str, show_progress: bool, local_cache: bool, diff --git a/docarray/store/jac.py b/docarray/store/jac.py index 5bcdc849b20..75685e61cb4 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -29,7 +29,7 @@ if TYPE_CHECKING: # pragma: no cover import io - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: @@ -40,17 +40,17 @@ def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: raise ValueError('Length not found in summary') -def _get_raw_summary(self: 'DocumentArray') -> List[Dict[str, Any]]: +def _get_raw_summary(self: 'DocArray') -> List[Dict[str, Any]]: items: List[Dict[str, Any]] = [ dict( name='Type', value=self.__class__.__name__, - description='The type of the DocumentArray', + description='The type of the DocArray', ), dict( name='Length', value=len(self), - description='The length of the DocumentArray', + description='The length of the DocArray', ), dict( name='Homogenous Documents', @@ -76,7 +76,7 @@ def _get_raw_summary(self: 'DocumentArray') -> List[Dict[str, Any]]: class JACDocStore(AbstractDocStore): - """Class to push and pull DocumentArray to and from Jina AI Cloud.""" + """Class to push and pull DocArray to and from Jina AI Cloud.""" @staticmethod @hubble.login_required @@ -85,7 +85,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: :param namespace: Not supported for Jina AI Cloud. :param show_table: if true, show the table of the arrays. - :returns: List of available DocumentArray's names. 
+ :returns: List of available DocArray's names. """ if len(namespace) > 0: logging.warning('Namespace is not supported for Jina AI Cloud.') @@ -96,11 +96,11 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: from rich.table import Table resp = HubbleClient(jsonify=True).list_artifacts( - filter={'type': 'documentArray'}, sort={'createdAt': 1} + filter={'type': 'DocArray'}, sort={'createdAt': 1} ) table = Table( - title=f'You have {resp["meta"]["total"]} DocumentArray on the cloud', + title=f'You have {resp["meta"]["total"]} DocArray on the cloud', box=box.SIMPLE, highlight=True, ) @@ -129,10 +129,10 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: @hubble.login_required def delete(name: str, missing_ok: bool = True) -> bool: """ - Delete a DocumentArray from the cloud. - :param name: the name of the DocumentArray to delete. - :param missing_ok: if true, do not raise an error if the DocumentArray does not exist. - :return: True if the DocumentArray was deleted, False if it did not exist. + Delete a DocArray from the cloud. + :param name: the name of the DocArray to delete. + :param missing_ok: if true, do not raise an error if the DocArray does not exist. + :return: True if the DocArray was deleted, False if it did not exist. """ try: HubbleClient(jsonify=True).delete_artifact(name=name) @@ -146,13 +146,13 @@ def delete(name: str, missing_ok: bool = True) -> bool: @staticmethod @hubble.login_required def push( - da: 'DocumentArray', + da: 'DocArray', name: str, public: bool = True, show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocumentArray object to Jina AI Cloud + """Push this DocArray object to Jina AI Cloud .. note:: - Push with the same ``name`` will override the existing content. @@ -161,8 +161,8 @@ def push( - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. 
Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocumentArray`. - :param public: By default, anyone can pull a DocumentArray if they know its name. + :param name: A name that can later be used to retrieve this :class:`DocArray`. + :param public: By default, anyone can pull a DocArray if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -175,11 +175,11 @@ def push( data, ctype = urllib3.filepost.encode_multipart_formdata( { 'file': ( - 'DocumentArray', + 'DocArray', delimiter, ), 'name': name, - 'type': 'documentArray', + 'type': 'DocArray', 'public': public, 'metaData': json.dumps( { @@ -246,20 +246,20 @@ def push_stream( - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocumentArray`. - :param public: By default, anyone can pull a DocumentArray if they know its name. + :param name: A name that can later be used to retrieve this :class:`DocArray`. + :param public: By default, anyone can pull a DocArray if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. 
{"icon": "emoji", "background": "#fff"} """ - from docarray import DocumentArray + from docarray import DocArray # This is a temporary solution to push a stream of documents # The memory footprint is not ideal - # But it must be done this way for now because Hubble expects to know the length of the DocumentArray + # But it must be done this way for now because Hubble expects to know the length of the DocArray # before it starts receiving the documents first_doc = next(docs) - da = DocumentArray[first_doc.__class__]([first_doc]) # type: ignore + da = DocArray[first_doc.__class__]([first_doc]) # type: ignore for doc in docs: da.append(doc) return cls.push(da, name, public, show_progress, branding) @@ -267,37 +267,37 @@ def push_stream( @staticmethod @hubble.login_required def pull( - cls: Type['DocumentArray'], + cls: Type['DocArray'], name: str, show_progress: bool = False, local_cache: bool = True, - ) -> 'DocumentArray': - """Pull a :class:`DocumentArray` from Jina AI Cloud to local. + ) -> 'DocArray': + """Pull a :class:`DocArray` from Jina AI Cloud to local. :param name: the upload name set during :meth:`.push` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocumentArray to local folder - :return: a :class:`DocumentArray` object + :param local_cache: store the downloaded DocArray to local folder + :return: a :class:`DocArray` object """ - from docarray import DocumentArray + from docarray import DocArray - return DocumentArray[cls.document_type]( # type: ignore + return DocArray[cls.document_type]( # type: ignore JACDocStore.pull_stream(cls, name, show_progress, local_cache) ) @staticmethod @hubble.login_required def pull_stream( - cls: Type['DocumentArray'], + cls: Type['DocArray'], name: str, show_progress: bool = False, local_cache: bool = False, ) -> Iterator['BaseDoc']: - """Pull a :class:`DocumentArray` from Jina AI Cloud to local. + """Pull a :class:`DocArray` from Jina AI Cloud to local. 
:param name: the upload name set during :meth:`.push` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocumentArray to local folder + :param local_cache: store the downloaded DocArray to local folder :return: An iterator of Documents """ import requests diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 64399b827ab..b038e7c4b41 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -13,7 +13,7 @@ from docarray.utils.cache import get_cache_path if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray SelfS3DocStore = TypeVar('SelfS3DocStore', bound='S3DocStore') @@ -44,15 +44,15 @@ def close(self): class S3DocStore(AbstractDocStore): - """Class to push and pull DocumentArray to and from S3.""" + """Class to push and pull DocArray to and from S3.""" @staticmethod def list(namespace: str, show_table: bool = False) -> List[str]: - """List all DocumentArrays in the specified bucket and namespace. + """List all DocArrays in the specified bucket and namespace. :param namespace: The bucket and namespace to list. e.g. my_bucket/my_namespace :param show_table: If true, a rich table will be printed to the console. - :return: A list of DocumentArray names. + :return: A list of DocArray names. 
""" bucket, namespace = namespace.split('/', 1) s3 = boto3.resource('s3') @@ -70,7 +70,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: from rich.table import Table table = Table( - title=f'You have {len(da_files)} DocumentArrays in bucket s3://{bucket} under the namespace "{namespace}"', + title=f'You have {len(da_files)} DocArrays in bucket s3://{bucket} under the namespace "{namespace}"', box=box.SIMPLE, highlight=True, ) @@ -90,7 +90,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: @staticmethod def delete(name: str, missing_ok: bool = True) -> bool: - """Delete the DocumentArray object at the specified bucket and key. + """Delete the DocArray object at the specified bucket and key. :param name: The bucket and key to delete. e.g. my_bucket/my_key :param missing_ok: If true, no error will be raised if the object does not exist. @@ -115,15 +115,15 @@ def delete(name: str, missing_ok: bool = True) -> bool: @classmethod def push( cls: Type[SelfS3DocStore], - da: 'DocumentArray', + da: 'DocArray', name: str, public: bool = False, show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocumentArray object to the specified bucket and key. + """Push this DocArray object to the specified bucket and key. - :param da: The DocumentArray to push. + :param da: The DocArray to push. :param name: The bucket and key to push to. e.g. my_bucket/my_key :param public: Not used by the ``s3`` protocol. :param show_progress: If true, a progress bar will be displayed. @@ -173,17 +173,17 @@ def push_stream( @classmethod def pull( cls: Type[SelfS3DocStore], - da_cls: Type['DocumentArray'], + da_cls: Type['DocArray'], name: str, show_progress: bool = False, local_cache: bool = False, - ) -> 'DocumentArray': - """Pull a :class:`DocumentArray` from the specified bucket and key. + ) -> 'DocArray': + """Pull a :class:`DocArray` from the specified bucket and key. :param name: The bucket and key to pull from. e.g. 
my_bucket/my_key :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocumentArray to local cache - :return: a :class:`DocumentArray` object + :param local_cache: store the downloaded DocArray to local cache + :return: a :class:`DocArray` object """ da = da_cls( # type: ignore cls.pull_stream( @@ -195,7 +195,7 @@ def pull( @classmethod def pull_stream( cls: Type[SelfS3DocStore], - da_cls: Type['DocumentArray'], + da_cls: Type['DocArray'], name: str, show_progress: bool, local_cache: bool, @@ -205,7 +205,7 @@ def pull_stream( :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocumentArray to local cache + :param local_cache: store the downloaded DocArray to local cache :return: An iterator of Documents """ diff --git a/docarray/typing/tensor/abstract_tensor.py b/docarray/typing/tensor/abstract_tensor.py index 049151d3a47..08aa0d014ae 100644 --- a/docarray/typing/tensor/abstract_tensor.py +++ b/docarray/typing/tensor/abstract_tensor.py @@ -293,7 +293,7 @@ def __iter__(self): @abc.abstractmethod def to_protobuf(self) -> 'NdArrayProto': - """Convert DocumentArray into a Protobuf message""" + """Convert DocArray into a Protobuf message""" ... def unwrap(self): diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index ceefa10bfd1..7a9887364fc 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -2,7 +2,7 @@ from typing import Dict, List, Union from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocumentArray +from docarray.array.array.array import DocArray def filter_docs( @@ -17,7 +17,7 @@ def filter_docs( .. 
code-block:: python - from docarray import DocumentArray, BaseDoc + from docarray import DocArray, BaseDoc from docarray.documents import Text, Image from docarray.util.filter import filter_docs @@ -28,7 +28,7 @@ class MyDocument(BaseDoc): price: int - docs = DocumentArray[MyDocument]( + docs = DocArray[MyDocument]( [ MyDocument( caption='A tiger in the jungle', @@ -58,9 +58,9 @@ class MyDocument(BaseDoc): assert results[0].caption == 'A couple birdwatching with binoculars' assert results[0].image.url == 'binocularsphoto.png' - :param docs: the DocumentArray where to apply the filter + :param docs: the DocArray where to apply the filter :param query: the query to filter by - :return: A DocumentArray containing the Documents + :return: A DocArray containing the Documents in `docs` that fulfill the filter conditions in the `query` """ from docarray.utils.query_language.query_parser import QueryParser @@ -68,7 +68,7 @@ class MyDocument(BaseDoc): if query: query = query if not isinstance(query, str) else json.loads(query) parser = QueryParser(query) - return DocumentArray.__class_getitem__(docs.document_type)( + return DocArray.__class_getitem__(docs.document_type)( d for d in docs if parser.evaluate(d) ) else: diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 4bbd47767da..98229eb7b7b 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -3,8 +3,8 @@ from typing_inspect import is_union_type from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocumentArray -from docarray.array.stacked.array_stacked import DocumentArrayStacked +from docarray.array.array.array import DocArray +from docarray.array.stacked.array_stacked import DocArrayStacked from docarray.base_document import BaseDoc from docarray.helper import _get_field_type_by_access_path from docarray.typing import AnyTensor @@ -12,12 +12,12 @@ class FindResult(NamedTuple): - documents: DocumentArray + documents: DocArray scores: AnyTensor class 
_FindResult(NamedTuple): - documents: Union[DocumentArray, List[Dict[str, Any]]] + documents: Union[DocArray, List[Dict[str, Any]]] scores: AnyTensor @@ -50,7 +50,7 @@ def find( .. code-block:: python - from docarray import DocumentArray, BaseDoc + from docarray import DocArray, BaseDoc from docarray.typing import TorchTensor from docarray.util.find import find @@ -59,7 +59,7 @@ class MyDocument(BaseDoc): embedding: TorchTensor - index = DocumentArray[MyDocument]( + index = DocArray[MyDocument]( [MyDocument(embedding=torch.rand(128)) for _ in range(100)] ) @@ -94,7 +94,7 @@ class MyDocument(BaseDoc): can be either `cpu` or a `cuda` device. :param descending: sort the results in descending order. Per default, this is chosen based on the `metric` argument. - :return: A named tuple of the form (DocumentArray, AnyTensor), + :return: A named tuple of the form (DocArray, AnyTensor), where the first element contains the closes matches for the query, and the second element contains the corresponding scores. """ @@ -112,7 +112,7 @@ class MyDocument(BaseDoc): def find_batched( index: AnyDocArray, - query: Union[AnyTensor, DocumentArray], + query: Union[AnyTensor, DocArray], embedding_field: str = 'embedding', metric: str = 'cosine_sim', limit: int = 10, @@ -139,7 +139,7 @@ def find_batched( .. 
code-block:: python - from docarray import DocumentArray, BaseDoc + from docarray import DocArray, BaseDoc from docarray.typing import TorchTensor from docarray.util.find import find @@ -148,14 +148,12 @@ class MyDocument(BaseDoc): embedding: TorchTensor - index = DocumentArray[MyDocument]( + index = DocArray[MyDocument]( [MyDocument(embedding=torch.rand(128)) for _ in range(100)] ) - # use DocumentArray as query - query = DocumentArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(3)] - ) + # use DocArray as query + query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) results = find( index=index, query=query, @@ -187,7 +185,7 @@ class MyDocument(BaseDoc): can be either `cpu` or a `cuda` device. :param descending: sort the results in descending order. Per default, this is chosen based on the `metric` argument. - :return: a list of named tuples of the form (DocumentArray, AnyTensor), + :return: a list of named tuples of the form (DocArray, AnyTensor), where the first element contains the closes matches for each query, and the second element contains the corresponding scores. 
""" @@ -210,16 +208,16 @@ class MyDocument(BaseDoc): results = [] for indices_per_query, scores_per_query in zip(top_indices, top_scores): - docs_per_query: DocumentArray = DocumentArray([]) + docs_per_query: DocArray = DocArray([]) for idx in indices_per_query: # workaround until #930 is fixed docs_per_query.append(index[idx]) - docs_per_query = DocumentArray(docs_per_query) + docs_per_query = DocArray(docs_per_query) results.append(FindResult(scores=scores_per_query, documents=docs_per_query)) return results def _extract_embedding_single( - data: Union[DocumentArray, BaseDoc, AnyTensor], + data: Union[DocArray, BaseDoc, AnyTensor], embedding_field: str, ) -> AnyTensor: """Extract the embeddings from a single query, @@ -254,10 +252,10 @@ def _extract_embeddings( :return: the embeddings """ emb: AnyTensor - if isinstance(data, DocumentArray): + if isinstance(data, DocArray): emb_list = list(AnyDocArray._traverse(data, embedding_field)) emb = embedding_type._docarray_stack(emb_list) - elif isinstance(data, (DocumentArrayStacked, BaseDoc)): + elif isinstance(data, (DocArrayStacked, BaseDoc)): emb = next(AnyDocArray._traverse(data, embedding_field)) else: # treat data as tensor emb = cast(AnyTensor, data) @@ -269,9 +267,9 @@ def _extract_embeddings( def _da_attr_type(da: AnyDocArray, access_path: str) -> Type[AnyTensor]: """Get the type of the attribute according to the Document type - (schema) of the DocumentArray. + (schema) of the DocArray. - :param da: the DocumentArray + :param da: the DocArray :param access_path: the "__"-separated access path :return: the type of the attribute """ diff --git a/docarray/utils/map.py b/docarray/utils/map.py index dc0d7fb17a9..9642d38f10a 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -29,7 +29,7 @@ def map_docs( .. 
code-block:: python - from docarray import DocumentArray + from docarray import DocArray from docarray.documents import Image from docarray.utils.map import map_docs @@ -39,15 +39,15 @@ def load_url_to_tensor(img: Image) -> Image: return img - da = DocumentArray[Image]([Image(url='/path/to/img.png') for _ in range(100)]) - da = DocumentArray[Image]( + da = DocArray[Image]([Image(url='/path/to/img.png') for _ in range(100)]) + da = DocArray[Image]( list(map_docs(da, load_url_to_tensor, backend='thread')) ) # threading is usually a good option for IO-bound tasks such as loading an image from url for doc in da: assert doc.tensor is not None - :param da: DocumentArray to apply function to + :param da: DocArray to apply function to :param func: a function that takes a :class:`BaseDoc` as input and outputs a :class:`BaseDoc`. :param backend: `thread` for multithreading and `process` for multiprocessing. @@ -112,7 +112,7 @@ def map_docs_batch( EXAMPLE USAGE .. code-block:: python - from docarray import BaseDoc, DocumentArray + from docarray import BaseDoc, DocArray from docarray.utils.map import map_docs_batch @@ -120,13 +120,13 @@ class MyDoc(BaseDoc): name: str - def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]: + def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: da.name = [n.upper() for n in da.name] return da batch_size = 16 - da = DocumentArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) it = map_docs_batch(da, upper_case_name, batch_size=batch_size) for i, d in enumerate(it): da[i * batch_size : (i + 1) * batch_size] = d @@ -138,7 +138,7 @@ def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]: ['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT'] - :param da: DocumentArray to apply function to + :param da: DocArray to apply function to :param batch_size: Size of each generated batch (except the last one, which might be smaller). 
:param shuffle: If set, shuffle the Documents before dividing into minibatches. @@ -166,7 +166,7 @@ def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]: :param pool: use an existing/external pool. If given, `backend` is ignored and you will be responsible for closing the pool. - :yield: DocumentArrays returned from `func` + :yield: DocArrays returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): raise ValueError( diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index 60493d04ea5..c021febea00 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -1,26 +1,27 @@ -from docarray import DocumentArray -from typing import List, Optional, Dict +from typing import Dict, List, Optional + +from docarray import DocArray def reduce( - left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None -) -> 'DocumentArray': + left: DocArray, right: DocArray, left_id_map: Optional[Dict] = None +) -> 'DocArray': """ - Reduces left and right DocumentArray into one DocumentArray in-place. - Changes are applied to the left DocumentArray. - Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray - to the first DocumentArray if they do not exist. - If a Document exists in both DocumentArrays (identified by ID), + Reduces left and right DocArray into one DocArray in-place. + Changes are applied to the left DocArray. + Reducing 2 DocArrays consists in adding Documents in the second DocArray + to the first DocArray if they do not exist. + If a Document exists in both DocArrays (identified by ID), the data properties are merged with priority to the left Document. - Nested DocumentArrays are also reduced in the same way. - :param left: First DocumentArray to be reduced. Changes will be applied to it + Nested DocArrays are also reduced in the same way. + :param left: First DocArray to be reduced. 
Changes will be applied to it in-place - :param right: Second DocumentArray to be reduced + :param right: Second DocArray to be reduced :param left_id_map: Optional parameter to be passed in repeated calls for optimizations, keeping a map of the Document ID to its offset - in the DocumentArray - :return: Reduced DocumentArray + in the DocArray + :return: Reduced DocArray """ left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)} @@ -33,32 +34,31 @@ def reduce( return left -def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray: +def reduce_all(docarrays: List[DocArray]) -> DocArray: """ - Reduces a list of DocumentArrays into one DocumentArray. - Changes are applied to the first DocumentArray in-place. + Reduces a list of DocArrays into one DocArray. + Changes are applied to the first DocArray in-place. - The resulting DocumentArray contains Documents of all DocumentArrays. - If a Document exists (identified by their ID) in many DocumentArrays, + The resulting DocArray contains Documents of all DocArrays. + If a Document exists (identified by their ID) in many DocArrays, data properties are merged with priority to the left-most - DocumentArrays (that is, if a data attribute is set in a Document - belonging to many DocumentArrays, the attribute value of the left-most - DocumentArray is kept). - Nested DocumentArrays belonging to many DocumentArrays + DocArrays (that is, if a data attribute is set in a Document + belonging to many DocArrays, the attribute value of the left-most + DocArray is kept). + Nested DocArrays belonging to many DocArrays are also reduced in the same way. .. note:: - - Nested DocumentArrays order does not follow any specific rule. + - Nested DocArrays order does not follow any specific rule. You might want to re-sort them in a later step. - - The final result depends on the order of DocumentArrays + - The final result depends on the order of DocArrays when applying reduction. 
- :param docarrays: List of DocumentArrays to be reduced - :return: the resulting DocumentArray + :param docarrays: List of DocArrays to be reduced + :return: the resulting DocArray """ if len(docarrays) <= 1: raise Exception( - 'In order to reduce DocumentArrays' - ' we should have more than one DocumentArray' + 'In order to reduce DocArrays' ' we should have more than one DocArray' ) left = docarrays[0] others = docarrays[1:] diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index b8b562c2a2b..d44a4913864 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,3 +1,3 @@ -# DocumentArray +# DocArray -::: docarray.array.array.array.DocumentArray +::: docarray.array.array.array.DocArray diff --git a/docs/api_references/array/da_stack.md b/docs/api_references/array/da_stack.md index 95693cd5bde..7f5f9e51a86 100644 --- a/docs/api_references/array/da_stack.md +++ b/docs/api_references/array/da_stack.md @@ -1,3 +1,3 @@ -# DocumentArrayStacked +# DocArrayStacked -::: docarray.array.array.array.DocumentArrayStacked +::: docarray.array.array.array.DocArrayStacked diff --git a/docs/tutorials/multimodal_training_and_serving.md b/docs/tutorials/multimodal_training_and_serving.md index fa9bcb62ad5..defe5ad27cf 100644 --- a/docs/tutorials/multimodal_training_and_serving.md +++ b/docs/tutorials/multimodal_training_and_serving.md @@ -83,7 +83,7 @@ The `BaseDocument` class allows users to define their own (nested, multi-modal) Let's start by defining a few Documents to handle the different modalities that we will use during our training: ```python -from docarray import BaseDocument, DocumentArray +from docarray import BaseDocument, DocArray from docarray.typing import TorchTensor, ImageUrl ``` @@ -184,14 +184,14 @@ import pandas as pd def get_flickr8k_da(file: str = "captions.txt", N: Optional[int] = None): df = pd.read_csv(file, nrows=N) - da = DocumentArray[PairTextImage]( + da = DocArray[PairTextImage]( 
PairTextImage(text=Text(text=i.caption), image=Image(url=f"Images/{i.image}")) for i in df.itertuples() ) return da ``` -In the `get_flickr8k_da` method we process the Flickr8k dataset into a `DocumentArray`. +In the `get_flickr8k_da` method we process the Flickr8k dataset into a `DocArray`. Now let's instantiate this dataset using the `MultiModalDataset` class. The constructor takes in the `da` and a dictionary of preprocessing transformations: @@ -214,11 +214,11 @@ loader = DataLoader( ) ``` -## Create the Pytorch model that works on DocumentArray +## Create the Pytorch model that works on DocArray In this section we create two encoders, one per modality (Text and Image). These encoders are normal PyTorch `nn.Module`s. -The only difference is that they operate on DocumentArray rather that on torch.Tensor: +The only difference is that they operate on DocArray rather that on torch.Tensor: ```python class TextEncoder(nn.Module): @@ -226,7 +226,7 @@ class TextEncoder(nn.Module): super().__init__() self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased") - def forward(self, texts: DocumentArray[Text]) -> TorchTensor: + def forward(self, texts: DocArray[Text]) -> TorchTensor: last_hidden_state = self.bert( input_ids=texts.tokens.input_ids, attention_mask=texts.tokens.attention_mask ).last_hidden_state @@ -240,8 +240,8 @@ class TextEncoder(nn.Module): return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True) ``` -The `TextEncoder` takes a `DocumentArray` of `Text`s as input, and returns an embedding `TorchTensor` as output. -`DocumentArray` can be seen as a list of `Text` documents, and the encoder will treat it as one batch. +The `TextEncoder` takes a `DocArray` of `Text`s as input, and returns an embedding `TorchTensor` as output. +`DocArray` can be seen as a list of `Text` documents, and the encoder will treat it as one batch. 
```python @@ -251,12 +251,12 @@ class VisionEncoder(nn.Module): self.backbone = torchvision.models.resnet18(pretrained=True) self.linear = nn.LazyLinear(out_features=768) - def forward(self, images: DocumentArray[Image]) -> TorchTensor: + def forward(self, images: DocArray[Image]) -> TorchTensor: x = self.backbone(images.tensor) return self.linear(x) ``` -Similarly, the `VisionEncoder` also takes a `DocumentArray` of `Image`s as input, and returns an embedding `TorchTensor` as output. +Similarly, the `VisionEncoder` also takes a `DocArray` of `Image`s as input, and returns an embedding `TorchTensor` as output. However, it operates on the `image` attribute of each Document. Now we can instantiate our encoders: @@ -289,7 +289,7 @@ def cosine_sim(x_mat: TorchTensor, y_mat: TorchTensor) -> TorchTensor: ``` ```python -def clip_loss(image: DocumentArray[Image], text: DocumentArray[Text]) -> TorchTensor: +def clip_loss(image: DocArray[Image], text: DocArray[Text]) -> TorchTensor: sims = cosine_sim(image.embedding, text.embedding) return torch.norm(sims - torch.eye(sims.shape[0], device=DEVICE)) ``` @@ -301,7 +301,7 @@ In the type hints of `cosine_sim` and `clip_loss` you can again notice that we c num_epoch = 1 # here you should do more epochs to really learn something ``` -One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocumentArray[PairTextImage]`, +One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocArray[PairTextImage]`, which is exactly what our model can operate on. 
So let's write a training loop and train our encoders: @@ -312,7 +312,7 @@ from tqdm import tqdm with torch.autocast(device_type="cuda", dtype=torch.float16): for epoch in range(num_epoch): for i, batch in tqdm(enumerate(loader), total=len(loader), desc=f"Epoch {epoch}"): - batch.to(DEVICE) # DocumentArray can be moved to device + batch.to(DEVICE) # DocArray can be moved to device optim.zero_grad() # FORWARD PASS: @@ -366,7 +366,7 @@ async def embed_text(doc: Text) -> Text: with torch.autocast(device_type="cuda", dtype=torch.float16): with torch.inference_mode(): text_preprocess(doc) - da = DocumentArray[Text]([doc], tensor_type=TorchTensor).stack() + da = DocArray[Text]([doc], tensor_type=TorchTensor).stack() da.to(DEVICE) doc.embedding = text_encoder(da)[0].to('cpu') return doc diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index bad86da41a7..ace42dacfa7 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from docarray.typing import NdArray from docarray.utils.map import map_docs, map_docs_batch @@ -32,7 +32,7 @@ def time_multiprocessing(num_workers: int) -> float: n_docs = 5 rng = np.random.RandomState(0) matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)] - da = DocumentArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) + da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) start_time = time() list( map_docs( @@ -47,7 +47,7 @@ def time_multiprocessing(num_workers: int) -> float: assert time_2_cpu < time_1_cpu -def cpu_intensive_batch(da: DocumentArray[MyMatrix]) -> DocumentArray[MyMatrix]: +def cpu_intensive_batch(da: DocArray[MyMatrix]) -> DocArray[MyMatrix]: # some cpu intensive function for doc in da: for i in range(3000): @@ -63,7 +63,7 @@ def time_multiprocessing(num_workers: int) -> 
float: n_docs = 16 rng = np.random.RandomState(0) matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)] - da = DocumentArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) + da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) start_time = time() list( map_docs_batch( @@ -91,7 +91,7 @@ def io_intensive(img: ImageDoc) -> ImageDoc: def test_map_docs_multithreading(): def time_multithreading(num_workers: int) -> float: n_docs = 100 - da = DocumentArray[ImageDoc]( + da = DocArray[ImageDoc]( [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)] ) start_time = time() @@ -106,7 +106,7 @@ def time_multithreading(num_workers: int) -> float: assert time_2_thread < time_1_thread -def io_intensive_batch(da: DocumentArray[ImageDoc]) -> DocumentArray[ImageDoc]: +def io_intensive_batch(da: DocArray[ImageDoc]) -> DocArray[ImageDoc]: # some io intensive function: load and set image url for doc in da: doc.tensor = doc.url.load() @@ -116,7 +116,7 @@ def io_intensive_batch(da: DocumentArray[ImageDoc]) -> DocumentArray[ImageDoc]: def test_map_docs_batch_multithreading(): def time_multithreading_batch(num_workers: int) -> float: n_docs = 100 - da = DocumentArray[ImageDoc]( + da = DocArray[ImageDoc]( [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)] ) start_time = time() diff --git a/tests/index/base_classes/test_base_doc_store.py b/tests/index/base_classes/test_base_doc_store.py index 4fa67344100..cd1d8339ab2 100644 --- a/tests/index/base_classes/test_base_doc_store.py +++ b/tests/index/base_classes/test_base_doc_store.py @@ -5,7 +5,7 @@ import pytest from pydantic import Field -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.index.abstract import BaseDocIndex, _raise_not_composable from docarray.typing import ID, NdArray @@ -221,12 +221,12 @@ class OtherNestedDoc(NestedDoc): # SIMPLE store = DummyDocIndex[SimpleDoc]() in_list = [SimpleDoc(tens=np.random.random((10,)))] - assert 
isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) - in_da = DocumentArray[SimpleDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) + in_da = DocArray[SimpleDoc](in_list) assert store._validate_docs(in_da) == in_da in_other_list = [OtherSimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDoc]) - in_other_da = DocumentArray[OtherSimpleDoc](in_other_list) + assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) + in_other_da = DocArray[OtherSimpleDoc](in_other_list) assert store._validate_docs(in_other_da) == in_other_da with pytest.raises(ValueError): @@ -239,7 +239,7 @@ class OtherNestedDoc(NestedDoc): ) with pytest.raises(ValueError): store._validate_docs( - DocumentArray[FlatDoc]( + DocArray[FlatDoc]( [ FlatDoc( tens_one=np.random.random((10,)), @@ -254,16 +254,16 @@ class OtherNestedDoc(NestedDoc): in_list = [ FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) - in_da = DocumentArray[FlatDoc]( + assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) + in_da = DocArray[FlatDoc]( [FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,)))] ) assert store._validate_docs(in_da) == in_da in_other_list = [ OtherFlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDoc]) - in_other_da = DocumentArray[OtherFlatDoc]( + assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) + in_other_da = DocArray[OtherFlatDoc]( [ OtherFlatDoc( tens_one=np.random.random((10,)), tens_two=np.random.random((50,)) @@ -275,20 +275,18 @@ class OtherNestedDoc(NestedDoc): store._validate_docs([SimpleDoc(tens=np.random.random((10,)))]) with pytest.raises(ValueError): assert not store._validate_docs( - 
DocumentArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) + DocArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) ) # NESTED store = DummyDocIndex[NestedDoc]() in_list = [NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) - in_da = DocumentArray[NestedDoc]( - [NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))] - ) + assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) + in_da = DocArray[NestedDoc]([NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))]) assert store._validate_docs(in_da) == in_da in_other_list = [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_other_list), DocumentArray[BaseDoc]) - in_other_da = DocumentArray[OtherNestedDoc]( + assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) + in_other_da = DocArray[OtherNestedDoc]( [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] ) @@ -297,7 +295,7 @@ class OtherNestedDoc(NestedDoc): store._validate_docs([SimpleDoc(tens=np.random.random((10,)))]) with pytest.raises(ValueError): store._validate_docs( - DocumentArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) + DocArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) ) @@ -311,8 +309,8 @@ class UnionDoc(BaseDoc): # OPTIONAL store = DummyDocIndex[SimpleDoc]() in_list = [OptionalDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) - in_da = DocumentArray[OptionalDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) + in_da = DocArray[OptionalDoc](in_list) assert store._validate_docs(in_da) == in_da with pytest.raises(ValueError): @@ -321,9 +319,9 @@ class UnionDoc(BaseDoc): # OTHER UNION store = DummyDocIndex[SimpleDoc]() in_list = [UnionDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocumentArray[BaseDoc]) - in_da 
= DocumentArray[UnionDoc](in_list) - assert isinstance(store._validate_docs(in_da), DocumentArray[BaseDoc]) + assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) + in_da = DocArray[UnionDoc](in_list) + assert isinstance(store._validate_docs(in_da), DocArray[BaseDoc]) with pytest.raises(ValueError): store._validate_docs([UnionDoc(tens='hello')]) diff --git a/tests/index/hnswlib/test_index_get_del.py b/tests/index/hnswlib/test_index_get_del.py index fa5c5f051b2..d8336e0ed6d 100644 --- a/tests/index/hnswlib/test_index_get_del.py +++ b/tests/index/hnswlib/test_index_get_del.py @@ -6,7 +6,7 @@ import torch from pydantic import Field -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc, TextDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray, NdArrayEmbedding, TorchTensor @@ -57,7 +57,7 @@ def ten_nested_docs(): def test_index_simple_schema(ten_simple_docs, tmp_path, use_docarray): store = HnswDocumentIndex[SimpleDoc](work_dir=str(tmp_path)) if use_docarray: - ten_simple_docs = DocumentArray[SimpleDoc](ten_simple_docs) + ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) store.index(ten_simple_docs) assert store.num_docs() == 10 @@ -77,7 +77,7 @@ class MyDoc(BaseDoc): def test_index_flat_schema(ten_flat_docs, tmp_path, use_docarray): store = HnswDocumentIndex[FlatDoc](work_dir=str(tmp_path)) if use_docarray: - ten_flat_docs = DocumentArray[FlatDoc](ten_flat_docs) + ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) store.index(ten_flat_docs) assert store.num_docs() == 10 @@ -89,7 +89,7 @@ def test_index_flat_schema(ten_flat_docs, tmp_path, use_docarray): def test_index_nested_schema(ten_nested_docs, tmp_path, use_docarray): store = HnswDocumentIndex[NestedDoc](work_dir=str(tmp_path)) if use_docarray: - ten_nested_docs = DocumentArray[NestedDoc](ten_nested_docs) + ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) store.index(ten_nested_docs) 
assert store.num_docs() == 10 @@ -137,7 +137,7 @@ class TextSchema(TextDoc): store = HnswDocumentIndex[TextSchema](work_dir=str(tmp_path)) store.index( - DocumentArray[TextDoc]( + DocArray[TextDoc]( [TextDoc(embedding=np.random.randn(10), text=f'{i}') for i in range(10)] ) ) @@ -154,7 +154,7 @@ class ImageSchema(ImageDoc): ) store.index( - DocumentArray[ImageDoc]( + DocArray[ImageDoc]( [ ImageDoc( embedding=np.random.randn(10), tensor=np.random.randn(3, 224, 224) diff --git a/tests/integrations/array/test_torch_train.py b/tests/integrations/array/test_torch_train.py index 8ed42a2ac19..930f237b0a1 100644 --- a/tests/integrations/array/test_torch_train.py +++ b/tests/integrations/array/test_torch_train.py @@ -2,7 +2,7 @@ import torch -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.typing import TorchTensor @@ -13,7 +13,7 @@ class Mmdoc(BaseDoc): N = 10 - batch = DocumentArray[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) + batch = DocArray[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) batch.tensor = torch.zeros(N, 3, 224, 224) batch = batch.stack() diff --git a/tests/integrations/document/test_document.py b/tests/integrations/document/test_document.py index 3a421a93b5a..9d8b85f260d 100644 --- a/tests/integrations/document/test_document.py +++ b/tests/integrations/document/test_document.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, ValidationError from typing_extensions import TypedDict -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import AudioDoc, ImageDoc, TextDoc from docarray.documents.helper import ( create_doc, @@ -35,14 +35,14 @@ class MyMultiModalDoc(BaseDoc): def test_nested_chunks_document(): class ChunksDocument(BaseDoc): text: str - images: DocumentArray[ImageDoc] + images: DocArray[ImageDoc] doc = ChunksDocument( text='hello', - images=DocumentArray[ImageDoc]([ImageDoc() for _ in range(10)]), + 
images=DocArray[ImageDoc]([ImageDoc() for _ in range(10)]), ) - assert isinstance(doc.images, DocumentArray) + assert isinstance(doc.images, DocArray) def test_create_doc(): diff --git a/tests/integrations/document/test_proto.py b/tests/integrations/document/test_proto.py index 2717dd7f423..c1f38d05b41 100644 --- a/tests/integrations/document/test_proto.py +++ b/tests/integrations/document/test_proto.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc, TextDoc from docarray.typing import ( AnyEmbedding, @@ -61,7 +61,7 @@ class MyDoc(BaseDoc): embedding: AnyEmbedding torch_embedding: TorchEmbedding[128] np_embedding: NdArrayEmbedding[128] - nested_docs: DocumentArray[NestedDoc] + nested_docs: DocArray[NestedDoc] bytes_: bytes img_bytes: ImageBytes @@ -80,7 +80,7 @@ class MyDoc(BaseDoc): embedding=np.zeros((3, 224, 224)), torch_embedding=torch.zeros((128,)), np_embedding=np.zeros((128,)), - nested_docs=DocumentArray[NestedDoc]([NestedDoc(tensor=np.zeros((128,)))]), + nested_docs=DocArray[NestedDoc]([NestedDoc(tensor=np.zeros((128,)))]), bytes_=b'hello', img_bytes=b'img', ) @@ -135,7 +135,7 @@ class MyDoc(BaseDoc): generic_tf_tensor: AnyTensor embedding: AnyEmbedding tf_embedding: TensorFlowEmbedding[128] - nested_docs: DocumentArray[NestedDoc] + nested_docs: DocArray[NestedDoc] doc = MyDoc( tf_tensor=tf.zeros((3, 224, 224)), @@ -143,7 +143,7 @@ class MyDoc(BaseDoc): generic_tf_tensor=tf.zeros((3, 224, 224)), embedding=tf.zeros((3, 224, 224)), tf_embedding=tf.zeros((128,)), - nested_docs=DocumentArray[NestedDoc]([NestedDoc(tensor=tf.zeros((128,)))]), + nested_docs=DocArray[NestedDoc]([NestedDoc(tensor=tf.zeros((128,)))]), ) doc = doc.to_protobuf() doc = MyDoc.from_protobuf(doc) diff --git a/tests/integrations/store/__init__.py b/tests/integrations/store/__init__.py index 51d9e298a4d..1191c403140 100644 --- a/tests/integrations/store/__init__.py +++ 
b/tests/integrations/store/__init__.py @@ -1,12 +1,12 @@ import tracemalloc from functools import wraps -from docarray import DocumentArray +from docarray import DocArray from docarray.documents import TextDoc def get_test_da(n: int): - return DocumentArray[TextDoc](gen_text_docs(n)) + return DocArray[TextDoc](gen_text_docs(n)) def gen_text_docs(n: int): diff --git a/tests/integrations/store/test_file.py b/tests/integrations/store/test_file.py index fc96ed7e686..43205260abd 100644 --- a/tests/integrations/store/test_file.py +++ b/tests/integrations/store/test_file.py @@ -3,7 +3,7 @@ import pytest -from docarray import DocumentArray +from docarray import DocArray from docarray.documents import TextDoc from docarray.store.file import ConcurrentPushException, FileDocStore from docarray.utils.cache import get_cache_path @@ -28,9 +28,7 @@ def test_pushpull_correct(capsys, tmp_path: Path): # Verbose da1.push(f'file://{namespace_dir}/meow', show_progress=True) - da2 = DocumentArray[TextDoc].pull( - f'file://{namespace_dir}/meow', show_progress=True - ) + da2 = DocArray[TextDoc].pull(f'file://{namespace_dir}/meow', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -41,7 +39,7 @@ def test_pushpull_correct(capsys, tmp_path: Path): # Quiet da2.push(f'file://{namespace_dir}/meow') - da1 = DocumentArray[TextDoc].pull(f'file://{namespace_dir}/meow') + da1 = DocArray[TextDoc].pull(f'file://{namespace_dir}/meow') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -57,10 +55,10 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( iter(da1), f'file://{namespace_dir}/meow', show_progress=True ) - doc_stream2 = DocumentArray[TextDoc].pull_stream( + 
doc_stream2 = DocArray[TextDoc].pull_stream( f'file://{namespace_dir}/meow', show_progress=True ) @@ -73,10 +71,10 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocumentArray[TextDoc].pull_stream( + doc_stream = DocArray[TextDoc].pull_stream( f'file://{namespace_dir}/meow', show_progress=False ) - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( doc_stream, f'file://{namespace_dir}/meow2', show_progress=False ) @@ -89,12 +87,12 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): def test_pull_stream_vs_pull_full(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f'file://{namespace_dir}/meow-short', show_progress=False, ) - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f'file://{namespace_dir}/meow-long', show_progress=False, @@ -103,14 +101,13 @@ def test_pull_stream_vs_pull_full(tmp_path: Path): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) - for d in DocumentArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): return sum( - len(d.text) for d in DocumentArray[TextDoc].pull(url, show_progress=False) + len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) ) # A warmup is needed to get accurate memory usage comparison @@ -152,12 +149,12 @@ def test_list_and_delete(tmp_path: Path): da_names = FileDocStore.list(namespace_dir, show_table=False) assert len(da_names) == 0 - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/meow', show_progress=False ) da_names = FileDocStore.list(namespace_dir, show_table=False) assert set(da_names) == {'meow'} - 
DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/woof', show_progress=False ) da_names = FileDocStore.list(namespace_dir, show_table=False) @@ -184,7 +181,7 @@ def test_concurrent_push_pull(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, @@ -194,17 +191,14 @@ def test_concurrent_push_pull(tmp_path: Path): def _task(choice: str): if choice == 'push': - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 - for _ in DocumentArray[TextDoc].pull_stream( - f'file://{namespace_dir}/da0' - ) + 1 for _ in DocArray[TextDoc].pull_stream(f'file://{namespace_dir}/da0') ) assert pull_len == DA_LEN else: @@ -222,7 +216,7 @@ def test_concurrent_push(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, @@ -238,7 +232,7 @@ def _slowdown_iterator(iterator): def _push(choice: str): if choice == 'slow': - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( _slowdown_iterator(gen_text_docs(DA_LEN)), f'file://{namespace_dir}/da0', show_progress=False, @@ -247,7 +241,7 @@ def _push(choice: str): elif choice == 'cold_start': try: time.sleep(0.1) - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, diff --git a/tests/integrations/store/test_jac.py b/tests/integrations/store/test_jac.py index aa1763298b9..94d3c693e32 100644 --- a/tests/integrations/store/test_jac.py +++ b/tests/integrations/store/test_jac.py @@ -4,7 +4,7 @@ 
import hubble import pytest -from docarray import DocumentArray +from docarray import DocArray from docarray.documents import TextDoc from docarray.store import JACDocStore from tests.integrations.store import gen_text_docs, get_test_da, profile_memory @@ -43,7 +43,7 @@ def test_pushpull_correct(capsys): # Verbose da1.push(f'jac://{DA_NAME}', show_progress=True) - da2 = DocumentArray[TextDoc].pull(f'jac://{DA_NAME}', show_progress=True) + da2 = DocArray[TextDoc].pull(f'jac://{DA_NAME}', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -54,7 +54,7 @@ def test_pushpull_correct(capsys): # Quiet da2.push(f'jac://{DA_NAME}') - da1 = DocumentArray[TextDoc].pull(f'jac://{DA_NAME}') + da1 = DocArray[TextDoc].pull(f'jac://{DA_NAME}') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -75,10 +75,8 @@ def test_pushpull_stream_correct(capsys): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocumentArray[TextDoc].push_stream( - iter(da1), f'jac://{DA_NAME_1}', show_progress=True - ) - doc_stream2 = DocumentArray[TextDoc].pull_stream( + DocArray[TextDoc].push_stream(iter(da1), f'jac://{DA_NAME_1}', show_progress=True) + doc_stream2 = DocArray[TextDoc].pull_stream( f'jac://{DA_NAME_1}', show_progress=True ) @@ -91,12 +89,10 @@ def test_pushpull_stream_correct(capsys): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocumentArray[TextDoc].pull_stream( + doc_stream = DocArray[TextDoc].pull_stream( f'jac://{DA_NAME_1}', show_progress=False ) - DocumentArray[TextDoc].push_stream( - doc_stream, f'jac://{DA_NAME_2}', show_progress=False - ) + DocArray[TextDoc].push_stream(doc_stream, f'jac://{DA_NAME_2}', show_progress=False) captured = capsys.readouterr() assert ( @@ -114,12 +110,12 @@ def test_pull_stream_vs_pull_full(): DA_NAME_SHORT: str = 
f'test{RANDOM}-pull-stream-vs-pull-full-short' DA_NAME_LONG: str = f'test{RANDOM}-pull-stream-vs-pull-full-long' - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f'jac://{DA_NAME_SHORT}', show_progress=False, ) - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f'jac://{DA_NAME_LONG}', show_progress=False, @@ -128,14 +124,13 @@ def test_pull_stream_vs_pull_full(): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) - for d in DocumentArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): return sum( - len(d.text) for d in DocumentArray[TextDoc].pull(url, show_progress=False) + len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) ) # A warmup is needed to get accurate memory usage comparison @@ -179,7 +174,7 @@ def test_list_and_delete(): ) assert len(da_names) == 0 - DocumentArray[TextDoc].push( + DocArray[TextDoc].push( get_test_da(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False ) da_names = list( @@ -189,7 +184,7 @@ def test_list_and_delete(): ) ) assert set(da_names) == {DA_NAME_0} - DocumentArray[TextDoc].push( + DocArray[TextDoc].push( get_test_da(DA_LEN), f'jac://{DA_NAME_1}', show_progress=False ) da_names = list( @@ -227,7 +222,7 @@ def test_concurrent_push_pull(): # Push to DA that is being pulled should not mess up the pull DA_NAME_0 = f'test{RANDOM}-concurrent-push-pull-da0' - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False, @@ -237,14 +232,14 @@ def test_concurrent_push_pull(): def _task(choice: str): if choice == 'push': - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 for _ in 
DocumentArray[TextDoc].pull_stream(f'jac://{DA_NAME_0}') + 1 for _ in DocArray[TextDoc].pull_stream(f'jac://{DA_NAME_0}') ) assert pull_len == DA_LEN else: diff --git a/tests/integrations/store/test_s3.py b/tests/integrations/store/test_s3.py index 37b5d6fe69e..ebe51b8c223 100644 --- a/tests/integrations/store/test_s3.py +++ b/tests/integrations/store/test_s3.py @@ -5,7 +5,7 @@ import pytest -from docarray import DocumentArray +from docarray import DocArray from docarray.documents import TextDoc from docarray.store import S3DocStore from tests.integrations.store import gen_text_docs, get_test_da, profile_memory @@ -72,7 +72,7 @@ def test_pushpull_correct(capsys): # Verbose da1.push(f's3://{namespace_dir}/meow', show_progress=True) - da2 = DocumentArray[TextDoc].pull(f's3://{namespace_dir}/meow', show_progress=True) + da2 = DocArray[TextDoc].pull(f's3://{namespace_dir}/meow', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -83,7 +83,7 @@ def test_pushpull_correct(capsys): # Quiet da2.push(f's3://{namespace_dir}/meow') - da1 = DocumentArray[TextDoc].pull(f's3://{namespace_dir}/meow') + da1 = DocArray[TextDoc].pull(f's3://{namespace_dir}/meow') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -99,10 +99,10 @@ def test_pushpull_stream_correct(capsys): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( iter(da1), f's3://{namespace_dir}/meow', show_progress=True ) - doc_stream2 = DocumentArray[TextDoc].pull_stream( + doc_stream2 = DocArray[TextDoc].pull_stream( f's3://{namespace_dir}/meow', show_progress=True ) @@ -115,10 +115,10 @@ def test_pushpull_stream_correct(capsys): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocumentArray[TextDoc].pull_stream( + doc_stream = 
DocArray[TextDoc].pull_stream( f's3://{namespace_dir}/meow', show_progress=False ) - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( doc_stream, f's3://{namespace_dir}/meow2', show_progress=False ) @@ -130,12 +130,12 @@ def test_pushpull_stream_correct(capsys): @pytest.mark.slow def test_pull_stream_vs_pull_full(): namespace_dir = f'{BUCKET}/test{RANDOM}/pull-stream-vs-pull-full' - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f's3://{namespace_dir}/meow-short', show_progress=False, ) - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f's3://{namespace_dir}/meow-long', show_progress=False, @@ -144,14 +144,13 @@ def test_pull_stream_vs_pull_full(): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) - for d in DocumentArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): return sum( - len(d.text) for d in DocumentArray[TextDoc].pull(url, show_progress=False) + len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) ) # A warmup is needed to get accurate memory usage comparison @@ -193,12 +192,12 @@ def test_list_and_delete(): da_names = S3DocStore.list(namespace_dir, show_table=False) assert len(da_names) == 0 - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/meow', show_progress=False ) da_names = S3DocStore.list(f'{namespace_dir}', show_table=False) assert set(da_names) == {'meow'} - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/woof', show_progress=False ) da_names = S3DocStore.list(f'{namespace_dir}', show_table=False) @@ -225,7 +224,7 @@ def test_concurrent_push_pull(): # Push to DA that is being pulled should not mess up the pull namespace_dir = 
f'{BUCKET}/test{RANDOM}/concurrent-push-pull' - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/da0', show_progress=False, @@ -235,15 +234,14 @@ def test_concurrent_push_pull(): def _task(choice: str): if choice == 'push': - DocumentArray[TextDoc].push_stream( + DocArray[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/da0', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 - for _ in DocumentArray[TextDoc].pull_stream(f's3://{namespace_dir}/da0') + 1 for _ in DocArray[TextDoc].pull_stream(f's3://{namespace_dir}/da0') ) assert pull_len == DA_LEN else: diff --git a/tests/integrations/torch/data/test_torch_dataset.py b/tests/integrations/torch/data/test_torch_dataset.py index 569b66db49b..238e05e8ac2 100644 --- a/tests/integrations/torch/data/test_torch_dataset.py +++ b/tests/integrations/torch/data/test_torch_dataset.py @@ -2,7 +2,7 @@ import torch from torch.utils.data import DataLoader -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.data import MultiModalDataset from docarray.documents import ImageDoc, TextDoc @@ -34,10 +34,10 @@ def __call__(self, text: str) -> None: @pytest.fixture -def captions_da() -> DocumentArray[PairTextImage]: +def captions_da() -> DocArray[PairTextImage]: with open("tests/toydata/captions.csv", "r") as f: f.readline() - da = DocumentArray[PairTextImage]( + da = DocArray[PairTextImage]( PairTextImage( text=TextDoc(text=i[1]), image=ImageDoc(url=f"tests/toydata/image-data/{i[0]}"), @@ -47,7 +47,7 @@ def captions_da() -> DocumentArray[PairTextImage]: return da -def test_torch_dataset(captions_da: DocumentArray[PairTextImage]): +def test_torch_dataset(captions_da: DocArray[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -56,16 +56,16 @@ def test_torch_dataset(captions_da: DocumentArray[PairTextImage]): dataset, batch_size=BATCH_SIZE, 
collate_fn=dataset.collate_fn, shuffle=True ) - from docarray.array.stacked.array_stacked import DocumentArrayStacked + from docarray.array.stacked.array_stacked import DocArrayStacked batch_lens = [] for batch in loader: - assert isinstance(batch, DocumentArrayStacked[PairTextImage]) + assert isinstance(batch, DocArrayStacked[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) -def test_primitives(captions_da: DocumentArray[PairTextImage]): +def test_primitives(captions_da: DocArray[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"text": Meowification()} @@ -78,7 +78,7 @@ def test_primitives(captions_da: DocumentArray[PairTextImage]): assert all(t.endswith(' meow') for t in batch.text) -def test_root_field(captions_da: DocumentArray[TextDoc]): +def test_root_field(captions_da: DocArray[TextDoc]): BATCH_SIZE = 32 preprocessing = {"": TextPreprocess()} @@ -91,7 +91,7 @@ def test_root_field(captions_da: DocumentArray[TextDoc]): assert batch.embedding.shape[1] == 64 -def test_nested_field(captions_da: DocumentArray[PairTextImage]): +def test_nested_field(captions_da: DocArray[PairTextImage]): BATCH_SIZE = 32 preprocessing = { @@ -122,7 +122,7 @@ def test_nested_field(captions_da: DocumentArray[PairTextImage]): @pytest.mark.slow -def test_torch_dl_multiprocessing(captions_da: DocumentArray[PairTextImage]): +def test_torch_dl_multiprocessing(captions_da: DocArray[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -136,17 +136,17 @@ def test_torch_dl_multiprocessing(captions_da: DocumentArray[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocumentArrayStacked + from docarray.array.stacked.array_stacked import DocArrayStacked batch_lens = [] for batch in loader: - assert isinstance(batch, DocumentArrayStacked[PairTextImage]) + assert isinstance(batch, DocArrayStacked[PairTextImage]) batch_lens.append(len(batch)) 
assert all(x == BATCH_SIZE for x in batch_lens[:-1]) @pytest.mark.skip(reason="UNRESOLVED BUG") -def test_torch_dl_pin_memory(captions_da: DocumentArray[PairTextImage]): +def test_torch_dl_pin_memory(captions_da: DocArray[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -164,10 +164,10 @@ def test_torch_dl_pin_memory(captions_da: DocumentArray[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocumentArrayStacked + from docarray.array.stacked.array_stacked import DocArrayStacked batch_lens = [] for batch in loader: - assert isinstance(batch, DocumentArrayStacked[PairTextImage]) + assert isinstance(batch, DocArrayStacked[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) diff --git a/tests/units/array/stack/storage/test_storage.py b/tests/units/array/stack/storage/test_storage.py index 280b986c886..591c2057d8b 100644 --- a/tests/units/array/stack/storage/test_storage.py +++ b/tests/units/array/stack/storage/test_storage.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array import DocumentArrayStacked +from docarray.array import DocArrayStacked from docarray.array.stacked.column_storage import ColumnStorageView from docarray.typing import AnyTensor @@ -20,13 +20,13 @@ class MyDoc(BaseDoc): for i in range(4) ] - storage = DocumentArrayStacked[MyDoc](docs)._storage + storage = DocArrayStacked[MyDoc](docs)._storage assert (storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() for name in storage.any_columns['name']: assert name == 'hello' inner_docs = storage.doc_columns['doc'] - assert isinstance(inner_docs, DocumentArrayStacked[InnerDoc]) + assert isinstance(inner_docs, DocArrayStacked[InnerDoc]) for i, doc in enumerate(inner_docs): assert doc.price == i @@ -38,7 +38,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=i) for i in range(4)] 
- storage = DocumentArrayStacked[MyDoc](docs)._storage + storage = DocArrayStacked[MyDoc](docs)._storage view = ColumnStorageView(0, storage) diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index 66d82ea523e..95cbf58c150 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -5,8 +5,8 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDoc, DocumentArray -from docarray.array import DocumentArrayStacked +from docarray import BaseDoc, DocArray +from docarray.array import DocArrayStacked from docarray.documents import ImageDoc from docarray.typing import AnyEmbedding, AnyTensor, NdArray, TorchTensor @@ -16,7 +16,7 @@ def batch(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArrayStacked[ImageDoc]( + batch = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -29,12 +29,12 @@ class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] class MMdoc(BaseDoc): - img: DocumentArray[ImageDoc] + img: DocArray[ImageDoc] - batch = DocumentArray[MMdoc]( + batch = DocArray[MMdoc]( [ MMdoc( - img=DocumentArray[ImageDoc]( + img=DocArray[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) ) @@ -47,7 +47,7 @@ class MMdoc(BaseDoc): def test_create_from_list_docs(): list_ = [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] - da_stacked = DocumentArrayStacked[ImageDoc](docs=list_, tensor_type=TorchTensor) + da_stacked = DocArrayStacked[ImageDoc](docs=list_, tensor_type=TorchTensor) assert len(da_stacked) == 10 assert da_stacked.tensor.shape == tuple([10, 3, 224, 224]) @@ -58,7 +58,7 @@ def test_len(batch): def test_create_from_None(): with pytest.raises(ValueError): - DocumentArrayStacked[ImageDoc]([]) + DocArrayStacked[ImageDoc]([]) def test_getitem(batch): @@ -75,7 +75,7 @@ def test_stack_setter(): class ImageDoc(BaseDoc): tensor: 
TorchTensor[3, 224, 224] - batch = DocumentArray[ImageDoc]( + batch = DocArray[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -92,7 +92,7 @@ def test_stack_setter_np(): class ImageDoc(BaseDoc): tensor: NdArray[3, 224, 224] - batch = DocumentArray[ImageDoc]( + batch = DocArray[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) @@ -116,7 +116,7 @@ def test_stack_numpy(): class ImageDoc(BaseDoc): tensor: NdArray[3, 224, 224] - batch = DocumentArray[ImageDoc]( + batch = DocArray[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) @@ -152,7 +152,7 @@ class ImageDoc(BaseDoc): class MMdoc(BaseDoc): img: ImageDoc - batch = DocumentArray[MMdoc]( + batch = DocArray[MMdoc]( [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) @@ -171,7 +171,7 @@ class MMdoc(BaseDoc): ) -def test_stack_nested_documentarray(nested_batch): +def test_stack_nested_DocArray(nested_batch): for i in range(len(nested_batch)): assert ( nested_batch[i].img._storage.tensor_columns['tensor'] @@ -188,7 +188,7 @@ def test_convert_to_da(batch): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArray[ImageDoc]( + batch = DocArray[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -206,7 +206,7 @@ class ImageDoc(BaseDoc): class MMdoc(BaseDoc): img: ImageDoc - batch = DocumentArray[MMdoc]( + batch = DocArray[MMdoc]( [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) @@ -218,10 +218,10 @@ class MMdoc(BaseDoc): assert (doc.img.tensor == torch.zeros(3, 224, 224)).all() -def test_unstack_nested_documentarray(nested_batch): +def test_unstack_nested_DocArray(nested_batch): batch = nested_batch.unstack() for i in range(len(batch)): - assert isinstance(batch[i].img, DocumentArray) + assert isinstance(batch[i].img, DocArray) for doc in batch[i].img: assert (doc.tensor == torch.zeros(3, 224, 224)).all() @@ -230,7 +230,7 @@ def 
test_stack_call(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - da = DocumentArray[ImageDoc]( + da = DocArray[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -245,7 +245,7 @@ def test_stack_union(): class ImageDoc(BaseDoc): tensor: Union[NdArray[3, 224, 224], TorchTensor[3, 224, 224]] - batch = DocumentArray[ImageDoc]( + batch = DocArray[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) batch[3].tensor = np.zeros((3, 224, 224)) @@ -263,7 +263,7 @@ def test_any_tensor_with_torch(tensor_type, tensor): class ImageDoc(BaseDoc): tensor: AnyTensor - da = DocumentArrayStacked[ImageDoc]( + da = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=tensor) for _ in range(10)], tensor_type=tensor_type, ) @@ -284,7 +284,7 @@ class ImageDoc(BaseDoc): class TopDoc(BaseDoc): img: ImageDoc - da = DocumentArrayStacked[TopDoc]( + da = DocArrayStacked[TopDoc]( [TopDoc(img=ImageDoc(tensor=tensor)) for _ in range(10)], tensor_type=TorchTensor, ) @@ -300,9 +300,7 @@ def test_dict_stack(): class MyDoc(BaseDoc): my_dict: Dict[str, int] - da = DocumentArrayStacked[MyDoc]( - [MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)] - ) + da = DocArrayStacked[MyDoc]([MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)]) da.my_dict @@ -314,12 +312,12 @@ class Doc(BaseDoc): N = 10 - da = DocumentArrayStacked[Doc]( + da = DocArrayStacked[Doc]( [Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)] ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocumentArrayStacked) + assert isinstance(da_sliced, DocArrayStacked) tensors = da_sliced.tensor assert tensors.shape == (5, 3, 224, 224) @@ -334,7 +332,7 @@ def test_stack_embedding(): class MyDoc(BaseDoc): embedding: AnyEmbedding - da = DocumentArrayStacked[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) + da = DocArrayStacked[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) assert 'embedding' in da._storage.tensor_columns.keys() assert (da.embedding 
== np.zeros((10, 10))).all() @@ -345,7 +343,7 @@ def test_stack_none(tensor_backend): class MyDoc(BaseDoc): tensor: Optional[AnyTensor] - da = DocumentArrayStacked[MyDoc]( + da = DocArrayStacked[MyDoc]( [MyDoc(tensor=None) for _ in range(10)], tensor_type=tensor_backend ) @@ -353,7 +351,7 @@ class MyDoc(BaseDoc): def test_to_device(): - da = DocumentArrayStacked[ImageDoc]( + da = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor ) assert da.tensor.device == torch.device('cpu') @@ -363,13 +361,13 @@ def test_to_device(): def test_to_device_with_nested_da(): class Video(BaseDoc): - images: DocumentArray[ImageDoc] + images: DocArray[ImageDoc] - da_image = DocumentArrayStacked[ImageDoc]( + da_image = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor ) - da = DocumentArrayStacked[Video]([Video(images=da_image)]) + da = DocArrayStacked[Video]([Video(images=da_image)]) assert da.images[0].tensor.device == torch.device('cpu') da.to('meta') assert da.images[0].tensor.device == torch.device('meta') @@ -380,7 +378,7 @@ class MyDoc(BaseDoc): tensor: TorchTensor docs: ImageDoc - da = DocumentArrayStacked[MyDoc]( + da = DocArrayStacked[MyDoc]( [MyDoc(tensor=torch.zeros(3, 5), docs=ImageDoc(tensor=torch.zeros(3, 5)))], tensor_type=TorchTensor, ) @@ -392,7 +390,7 @@ class MyDoc(BaseDoc): def test_to_device_numpy(): - da = DocumentArrayStacked[ImageDoc]( + da = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 5)))], tensor_type=NdArray ) with pytest.raises(NotImplementedError): @@ -403,7 +401,7 @@ def test_keep_dtype_torch(): class MyDoc(BaseDoc): tensor: TorchTensor - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [MyDoc(tensor=torch.zeros([2, 4], dtype=torch.int32)) for _ in range(3)] ) assert da[0].tensor.dtype == torch.int32 @@ -417,7 +415,7 @@ def test_keep_dtype_np(): class MyDoc(BaseDoc): tensor: NdArray - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [MyDoc(tensor=np.zeros([2, 4], 
dtype=np.int32)) for _ in range(3)] ) assert da[0].tensor.dtype == np.int32 @@ -438,7 +436,7 @@ def test_np_scalar(): class MyDoc(BaseDoc): scalar: NdArray - da = DocumentArray[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) + da = DocArray[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) assert all(doc.scalar.ndim == 0 for doc in da) assert all(doc.scalar == 2.0 for doc in da) @@ -458,7 +456,7 @@ def test_torch_scalar(): class MyDoc(BaseDoc): scalar: TorchTensor - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [MyDoc(scalar=torch.tensor(2.0)) for _ in range(3)], ) assert all(doc.scalar.ndim == 0 for doc in da) @@ -478,7 +476,7 @@ def test_np_nan(): class MyDoc(BaseDoc): scalar: Optional[NdArray] - da = DocumentArray[MyDoc]([MyDoc() for _ in range(3)]) + da = DocArray[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) stacked_da = da.stack() @@ -497,7 +495,7 @@ def test_torch_nan(): class MyDoc(BaseDoc): scalar: Optional[TorchTensor] - da = DocumentArray[MyDoc]([MyDoc() for _ in range(3)]) + da = DocArray[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) stacked_da = da.stack(tensor_type=TorchTensor) @@ -517,24 +515,24 @@ def test_from_storage(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArrayStacked[ImageDoc]( + batch = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - DocumentArrayStacked[ImageDoc].from_columns_storage(batch._storage) + DocArrayStacked[ImageDoc].from_columns_storage(batch._storage) def test_validate_from_da(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArray[ImageDoc]( + batch = DocArray[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - da = parse_obj_as(DocumentArrayStacked[ImageDoc], batch) + da = 
parse_obj_as(DocArrayStacked[ImageDoc], batch) - assert isinstance(da, DocumentArrayStacked[ImageDoc]) + assert isinstance(da, DocArrayStacked[ImageDoc]) def test_validation_column_tensor(batch): @@ -558,24 +556,22 @@ class Inner(BaseDoc): class Doc(BaseDoc): inner: Inner - batch = DocumentArrayStacked[Doc]( - [Doc(inner=Inner(hello='hello')) for _ in range(10)] - ) + batch = DocArrayStacked[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) return batch, Doc, Inner def test_validation_column_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc - batch.inner = DocumentArray[Inner]([Inner(hello='hello') for _ in range(10)]) - assert isinstance(batch.inner, DocumentArrayStacked[Inner]) + batch.inner = DocArray[Inner]([Inner(hello='hello') for _ in range(10)]) + assert isinstance(batch.inner, DocArrayStacked[Inner]) def test_validation_list_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc batch.inner = [Inner(hello='hello') for _ in range(10)] - assert isinstance(batch.inner, DocumentArrayStacked[Inner]) + assert isinstance(batch.inner, DocArrayStacked[Inner]) def test_validation_col_doc_fail(batch_nested_doc): @@ -585,7 +581,7 @@ def test_validation_col_doc_fail(batch_nested_doc): batch.inner = ['hello'] * 10 with pytest.raises(ValueError): - batch.inner = DocumentArray[Inner]([Inner(hello='hello') for _ in range(11)]) + batch.inner = DocArray[Inner]([Inner(hello='hello') for _ in range(11)]) def test_doc_view_update(batch): diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index 5b06a3c1b3c..e82bfc7716a 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -2,8 +2,8 @@ import pytest -from docarray import BaseDoc, DocumentArray -from docarray.array import DocumentArrayStacked +from docarray import BaseDoc, DocArray +from docarray.array import DocArrayStacked from docarray.typing import AnyTensor, NdArray from 
docarray.utils.misc import is_tf_available @@ -22,9 +22,7 @@ class Image(BaseDoc): import tensorflow as tf - batch = DocumentArray[Image]( - [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] - ) + batch = DocArray[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) return batch.stack() @@ -35,14 +33,14 @@ class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] class MMdoc(BaseDoc): - img: DocumentArray[Image] + img: DocArray[Image] import tensorflow as tf - batch = DocumentArrayStacked[MMdoc]( + batch = DocArrayStacked[MMdoc]( [ MMdoc( - img=DocumentArray[Image]( + img=DocArray[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] ) ) @@ -69,7 +67,7 @@ def test_getitem(batch): @pytest.mark.tensorflow def test_get_slice(batch): sliced = batch[0:2] - assert isinstance(sliced, DocumentArrayStacked) + assert isinstance(sliced, DocArrayStacked) assert len(sliced) == 2 @@ -84,7 +82,7 @@ def test_set_after_stacking(): class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - batch = DocumentArrayStacked[Image]( + batch = DocArrayStacked[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] ) @@ -111,7 +109,7 @@ class Image(BaseDoc): class MMdoc(BaseDoc): img: Image - batch = DocumentArray[MMdoc]( + batch = DocArray[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ).stack() @@ -124,7 +122,7 @@ class MMdoc(BaseDoc): @pytest.mark.tensorflow -def test_stack_nested_documentarray(nested_batch): +def test_stack_nested_DocArray(nested_batch): for i in range(len(nested_batch)): assert tnp.allclose( nested_batch[i].img._storage.tensor_columns['tensor'].tensor, @@ -152,7 +150,7 @@ class Image(BaseDoc): class MMdoc(BaseDoc): img: Image - batch = DocumentArrayStacked[MMdoc]( + batch = DocArrayStacked[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ) assert isinstance(batch.img._storage.tensor_columns['tensor'], TensorFlowTensor) @@ -163,10 +161,10 @@ class 
MMdoc(BaseDoc): @pytest.mark.tensorflow -def test_unstack_nested_documentarray(nested_batch): +def test_unstack_nested_DocArray(nested_batch): batch = nested_batch.unstack() for i in range(len(batch)): - assert isinstance(batch[i].img, DocumentArray) + assert isinstance(batch[i].img, DocArray) for doc in batch[i].img: assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) @@ -176,9 +174,7 @@ def test_stack_call(): class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - da = DocumentArray[Image]( - [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] - ) + da = DocArray[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) da = da.stack() @@ -192,7 +188,7 @@ def test_stack_union(): class Image(BaseDoc): tensor: Union[NdArray[3, 224, 224], TensorFlowTensor[3, 224, 224]] - DocumentArrayStacked[Image]( + DocArrayStacked[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -219,7 +215,7 @@ def test_any_tensor_with_tf(): class Image(BaseDoc): tensor: AnyTensor - da = DocumentArrayStacked[Image]( + da = DocArrayStacked[Image]( [Image(tensor=tensor) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -241,7 +237,7 @@ class Image(BaseDoc): class TopDoc(BaseDoc): img: Image - da = DocumentArrayStacked[TopDoc]( + da = DocArrayStacked[TopDoc]( [TopDoc(img=Image(tensor=tensor)) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -260,12 +256,12 @@ class Doc(BaseDoc): text: str tensor: TensorFlowTensor - da = DocumentArrayStacked[Doc]( + da = DocArrayStacked[Doc]( [Doc(text=f'hello{i}', tensor=tf.zeros((3, 224, 224))) for i in range(10)] ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocumentArrayStacked) + assert isinstance(da_sliced, DocArrayStacked) tensors = da_sliced.tensor.tensor assert tensors.shape == (5, 3, 224, 224) @@ -276,7 +272,7 @@ def test_stack_none(): class MyDoc(BaseDoc): tensor: Optional[AnyTensor] - da = DocumentArrayStacked[MyDoc]( + da = 
DocArrayStacked[MyDoc]( [MyDoc(tensor=None) for _ in range(10)], tensor_type=TensorFlowTensor ) assert 'tensor' in da._storage.tensor_columns.keys() @@ -287,7 +283,7 @@ def test_keep_dtype_tf(): class MyDoc(BaseDoc): tensor: TensorFlowTensor - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [MyDoc(tensor=tf.zeros([2, 4], dtype=tf.int32)) for _ in range(3)] ) assert da[0].tensor.tensor.dtype == tf.int32 diff --git a/tests/units/array/stack/test_init.py b/tests/units/array/stack/test_init.py index aedd761aadc..c4e906e82b1 100644 --- a/tests/units/array/stack/test_init.py +++ b/tests/units/array/stack/test_init.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array.stacked.array_stacked import DocumentArrayStacked +from docarray.array.stacked.array_stacked import DocArrayStacked from docarray.typing import AnyTensor, NdArray @@ -12,7 +12,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros(10), name='hello') for _ in range(4)] - da = DocumentArrayStacked[MyDoc](docs, tensor_type=NdArray) + da = DocArrayStacked[MyDoc](docs, tensor_type=NdArray) assert (da._storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() assert da._storage.any_columns['name']._data == ['hello' for _ in range(4)] @@ -25,7 +25,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=i * np.zeros((10, 10)), name=f'hello{i}') for i in range(4)] - da = DocumentArrayStacked[MyDoc](docs, tensor_type=NdArray) + da = DocArrayStacked[MyDoc](docs, tensor_type=NdArray) for i, doc in enumerate(da): assert isinstance(doc, MyDoc) diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py index 6a0f2881b08..1589c28197b 100644 --- a/tests/units/array/stack/test_proto.py +++ b/tests/units/array/stack/test_proto.py @@ -2,8 +2,8 @@ import pytest import torch -from docarray import BaseDoc, DocumentArray -from docarray.array import DocumentArrayStacked +from docarray import BaseDoc, DocArray +from docarray.array import DocArrayStacked from 
docarray.typing import NdArray, TorchTensor @@ -12,9 +12,7 @@ def batch(): class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArray[Image]( - [Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] - ) + batch = DocArray[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) return batch.stack() @@ -29,9 +27,7 @@ def test_proto_stacked_mode_numpy(): class MyDoc(BaseDoc): tensor: NdArray[3, 224, 224] - da = DocumentArray[MyDoc]( - [MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] - ) + da = DocArray[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) da = da.stack() @@ -43,10 +39,10 @@ def test_stacked_proto(): class CustomDocument(BaseDoc): image: NdArray - da = DocumentArray[CustomDocument]( + da = DocArray[CustomDocument]( [CustomDocument(image=np.zeros((3, 224, 224))) for _ in range(10)] ).stack() - da2 = DocumentArrayStacked.from_protobuf(da.to_protobuf()) + da2 = DocArrayStacked.from_protobuf(da.to_protobuf()) - assert isinstance(da2, DocumentArrayStacked) + assert isinstance(da2, DocArrayStacked) diff --git a/tests/units/array/test_array.py b/tests/units/array/test_array.py index 0126fbb69e9..bab9ccd1313 100644 --- a/tests/units/array/test_array.py +++ b/tests/units/array/test_array.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.typing import ImageUrl, NdArray, TorchTensor from docarray.utils.misc import is_tf_available @@ -18,7 +18,7 @@ def da(): class Text(BaseDoc): text: str - return DocumentArray[Text]([Text(text=f'hello {i}') for i in range(10)]) + return DocArray[Text]([Text(text=f'hello {i}') for i in range(10)]) def test_iterate(da): @@ -30,7 +30,7 @@ def test_append(): class Text(BaseDoc): text: str - da = DocumentArray[Text]([]) + da = DocArray[Text]([]) da.append(Text(text='hello', id='1')) @@ -42,7 +42,7 @@ def test_extend(): class Text(BaseDoc): text: str - da = 
DocumentArray[Text]([Text(text='hello', id=str(i)) for i in range(10)]) + da = DocArray[Text]([Text(text='hello', id=str(i)) for i in range(10)]) da.extend([Text(text='hello', id=str(10 + i)) for i in range(10)]) @@ -61,13 +61,13 @@ def test_document_array(): class Text(BaseDoc): text: str - da = DocumentArray([Text(text='hello') for _ in range(10)]) + da = DocArray([Text(text='hello') for _ in range(10)]) assert len(da) == 10 def test_empty_array(): - da = DocumentArray() + da = DocArray() len(da) == 0 @@ -75,7 +75,7 @@ def test_document_array_fixed_type(): class Text(BaseDoc): text: str - da = DocumentArray[Text]([Text(text='hello') for _ in range(10)]) + da = DocArray[Text]([Text(text='hello') for _ in range(10)]) assert len(da) == 10 @@ -87,7 +87,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocumentArray[Mmdoc]( + da = DocArray[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -113,9 +113,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocumentArray[Mmdoc]( - (Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N)) - ) + da = DocArray[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) list_docs = [InnerDoc(text=f'hello{i}') for i in range(N)] da._set_data_column('inner', list_docs) @@ -131,7 +129,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocumentArray[Mmdoc]( + da = DocArray[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -157,11 +155,9 @@ class Mmdoc(BaseDoc): N = 10 - da = DocumentArray[Mmdoc]( - (Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N)) - ) + da = DocArray[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) - assert isinstance(da.inner, DocumentArray) + assert isinstance(da.inner, DocArray) def test_get_bulk_attributes_optional_type(): @@ -171,7 +167,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocumentArray[Mmdoc]( + da = DocArray[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -195,7 +191,7 @@ class 
Mmdoc(BaseDoc): N = 10 - da = DocumentArray[Mmdoc]( + da = DocArray[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -223,7 +219,7 @@ class MyDoc(BaseDoc): Optional[Union[TorchTensor, NdArray, TensorFlowTensor]], TorchTensor ] - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [ MyDoc( embedding=torch.rand(10), @@ -250,12 +246,12 @@ class Doc(BaseDoc): N = 10 - da = DocumentArray[Doc]( + da = DocArray[Doc]( (Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocumentArray) + assert isinstance(da_sliced, DocArray) tensors = da_sliced.tensor assert len(tensors) == 5 @@ -299,13 +295,13 @@ def test_del_item(da): def test_generic_type_var(): T = TypeVar('T', bound=BaseDoc) - def f(a: DocumentArray[T]) -> DocumentArray[T]: + def f(a: DocArray[T]) -> DocArray[T]: return a - def g(a: DocumentArray['BaseDoc']) -> DocumentArray['BaseDoc']: + def g(a: DocArray['BaseDoc']) -> DocArray['BaseDoc']: return a - a = DocumentArray() + a = DocArray() f(a) g(a) @@ -316,7 +312,7 @@ class Text(BaseDoc): docs = [Text(text=f'hello {i}') for i in range(10)] - da = DocumentArray[Text].construct(docs) + da = DocArray[Text].construct(docs) assert da._data is docs @@ -327,7 +323,7 @@ class Text(BaseDoc): docs = [Text(text=f'hello {i}') for i in range(10)] - da = DocumentArray[Text](docs) + da = DocArray[Text](docs) da.reverse() assert da[-1].text == 'hello 0' assert da[0].text == 'hello 9' @@ -340,7 +336,7 @@ class Image(BaseDoc): def test_remove(): images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] - da = DocumentArray[Image](images) + da = DocArray[Image](images) da.remove(images[1]) assert len(da) == 2 assert da[0] == images[0] @@ -349,7 +345,7 @@ def test_remove(): def test_pop(): images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] - da = DocumentArray[Image](images) + da = DocArray[Image](images) popped = da.pop(1) assert len(da) == 2 
assert popped == images[1] @@ -361,7 +357,7 @@ def test_sort(): images = [ Image(url=f'http://url.com/foo_{i}.png', tensor=NdArray(i)) for i in [2, 0, 1] ] - da = DocumentArray[Image](images) + da = DocArray[Image](images) da.sort(key=lambda img: len(img.tensor)) assert len(da) == 3 assert da[0].url == 'http://url.com/foo_0.png' diff --git a/tests/units/array/test_array_from_to_bytes.py b/tests/units/array/test_array_from_to_bytes.py index 7112763402e..0d269e036a3 100644 --- a/tests/units/array/test_array_from_to_bytes.py +++ b/tests/units/array/test_array_from_to_bytes.py @@ -1,6 +1,6 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -17,7 +17,7 @@ class MyDoc(BaseDoc): @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) @pytest.mark.parametrize('show_progress', [False, True]) def test_from_to_bytes(protocol, compress, show_progress): - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -28,7 +28,7 @@ def test_from_to_bytes(protocol, compress, show_progress): bytes_da = da.to_bytes( protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocumentArray[MyDoc].from_bytes( + da2 = DocArray[MyDoc].from_bytes( bytes_da, protocol=protocol, compress=compress, show_progress=show_progress ) assert len(da2) == 2 @@ -47,7 +47,7 @@ def test_from_to_bytes(protocol, compress, show_progress): @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) @pytest.mark.parametrize('show_progress', [False, True]) def test_from_to_base64(protocol, compress, show_progress): - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -58,7 +58,7 @@ def test_from_to_base64(protocol, compress, show_progress): bytes_da = 
da.to_base64( protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocumentArray[MyDoc].from_base64( + da2 = DocArray[MyDoc].from_base64( bytes_da, protocol=protocol, compress=compress, show_progress=show_progress ) assert len(da2) == 2 diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py index 2a4049f4290..ecec376d433 100644 --- a/tests/units/array/test_array_from_to_csv.py +++ b/tests/units/array/test_array_from_to_csv.py @@ -3,7 +3,7 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from tests import TOYDATA_DIR @@ -22,7 +22,7 @@ class MyDocNested(MyDoc): def test_to_from_csv(tmpdir, nested_doc_cls): - da = DocumentArray[nested_doc_cls]( + da = DocArray[nested_doc_cls]( [ nested_doc_cls( count=0, @@ -37,13 +37,13 @@ def test_to_from_csv(tmpdir, nested_doc_cls): da.to_csv(tmp_file) assert os.path.isfile(tmp_file) - da_from = DocumentArray[nested_doc_cls].from_csv(tmp_file) + da_from = DocArray[nested_doc_cls].from_csv(tmp_file) for doc1, doc2 in zip(da, da_from): assert doc1 == doc2 def test_from_csv_nested(nested_doc_cls): - da = DocumentArray[nested_doc_cls].from_csv( + da = DocArray[nested_doc_cls].from_csv( file_path=str(TOYDATA_DIR / 'docs_nested.csv') ) assert len(da) == 3 @@ -91,11 +91,9 @@ class Outer(BaseDoc): def test_from_csv_without_schema_raise_exception(): with pytest.raises(TypeError, match='no document schema defined'): - DocumentArray.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) + DocArray.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) def test_from_csv_with_wrong_schema_raise_exception(nested_doc): with pytest.raises(ValueError, match='Column names do not match the schema'): - DocumentArray[nested_doc.__class__].from_csv( - file_path=str(TOYDATA_DIR / 'docs.csv') - ) + DocArray[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv')) diff --git 
a/tests/units/array/test_array_from_to_json.py b/tests/units/array/test_array_from_to_json.py index 2e910496a32..52d6b2ec977 100644 --- a/tests/units/array/test_array_from_to_json.py +++ b/tests/units/array/test_array_from_to_json.py @@ -1,4 +1,4 @@ -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -10,7 +10,7 @@ class MyDoc(BaseDoc): def test_from_to_json(): - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -19,7 +19,7 @@ def test_from_to_json(): ] ) json_da = da.to_json() - da2 = DocumentArray[MyDoc].from_json(json_da) + da2 = DocArray[MyDoc].from_json(json_da) assert len(da2) == 2 assert len(da) == len(da2) for d1, d2 in zip(da, da2): diff --git a/tests/units/array/test_array_from_to_pandas.py b/tests/units/array/test_array_from_to_pandas.py index 0ca762807a6..d01cd8a1d68 100644 --- a/tests/units/array/test_array_from_to_pandas.py +++ b/tests/units/array/test_array_from_to_pandas.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc @@ -20,7 +20,7 @@ class MyDocNested(MyDoc): def test_to_from_pandas_df(nested_doc_cls): - da = DocumentArray[nested_doc_cls]( + da = DocArray[nested_doc_cls]( [ nested_doc_cls( count=0, @@ -47,7 +47,7 @@ def test_to_from_pandas_df(nested_doc_cls): ] ).all() - da_from_df = DocumentArray[nested_doc_cls].from_pandas(df) + da_from_df = DocArray[nested_doc_cls].from_pandas(df) for doc1, doc2 in zip(da, da_from_df): assert doc1 == doc2 @@ -76,7 +76,7 @@ def test_from_pandas_without_schema_raise_exception(): df = pd.DataFrame( columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] ) - DocumentArray.from_pandas(df=df) + DocArray.from_pandas(df=df) def 
test_from_pandas_with_wrong_schema_raise_exception(nested_doc): @@ -84,4 +84,4 @@ def test_from_pandas_with_wrong_schema_raise_exception(nested_doc): df = pd.DataFrame( columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] ) - DocumentArray[nested_doc.__class__].from_pandas(df=df) + DocArray[nested_doc.__class__].from_pandas(df=df) diff --git a/tests/units/array/test_array_proto.py b/tests/units/array/test_array_proto.py index 5ba2b0fef65..ac0265016fc 100644 --- a/tests/units/array/test_array_proto.py +++ b/tests/units/array/test_array_proto.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @@ -12,11 +12,11 @@ class CustomDoc(BaseDoc): text: str tensor: NdArray - da = DocumentArray( + da = DocArray( [CustomDoc(text='hello', tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) - new_da = DocumentArray[CustomDoc].from_protobuf(da.to_protobuf()) + new_da = DocArray[CustomDoc].from_protobuf(da.to_protobuf()) for doc1, doc2 in zip(da, new_da): assert doc1.text == doc2.text @@ -29,7 +29,7 @@ class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc - da = DocumentArray[CustomDocument]( + da = DocArray[CustomDocument]( [ CustomDocument( text=TextDoc(text='hello'), @@ -39,7 +39,7 @@ class CustomDocument(BaseDoc): ] ) - DocumentArray[CustomDocument].from_protobuf(da.to_protobuf()) + DocArray[CustomDocument].from_protobuf(da.to_protobuf()) @pytest.mark.proto @@ -48,7 +48,7 @@ class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc - da = DocumentArray[CustomDocument]( + da = DocArray[CustomDocument]( [ CustomDocument( text=TextDoc(text='hello'), @@ -58,4 +58,4 @@ class CustomDocument(BaseDoc): ] ) - DocumentArray.from_protobuf(da.to_protobuf()) + DocArray.from_protobuf(da.to_protobuf()) diff --git a/tests/units/array/test_array_save_load.py b/tests/units/array/test_array_save_load.py 
index 6d5d9a8da4a..795c437608d 100644 --- a/tests/units/array/test_array_save_load.py +++ b/tests/units/array/test_array_save_load.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -23,7 +23,7 @@ class MyDoc(BaseDoc): def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): tmp_file = os.path.join(tmp_path, 'test') - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -36,7 +36,7 @@ def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocumentArray[MyDoc].load_binary( + da2 = DocArray[MyDoc].load_binary( tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) @@ -59,7 +59,7 @@ def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): def test_array_save_load_binary_streaming(protocol, compress, tmp_path, show_progress): tmp_file = os.path.join(tmp_path, 'test') - da = DocumentArray[MyDoc]() + da = DocArray[MyDoc]() def _extend_da(num_docs=100): for _ in range(num_docs): @@ -79,8 +79,8 @@ def _extend_da(num_docs=100): tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocumentArray[MyDoc]() - da_generator = DocumentArray[MyDoc].load_binary( + da2 = DocArray[MyDoc]() + da_generator = DocArray[MyDoc].load_binary( tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) diff --git a/tests/units/array/test_batching.py b/tests/units/array/test_batching.py index f7bce9bea96..389d649dbc4 100644 --- a/tests/units/array/test_batching.py +++ b/tests/units/array/test_batching.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, 
DocArray from docarray.typing import NdArray @@ -14,7 +14,7 @@ class MyDoc(BaseDoc): tensor: NdArray t_shape = (32, 32) - da = DocumentArray[MyDoc]( + da = DocArray[MyDoc]( [ MyDoc( id=i, diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py index 7d0fb36b0af..66f9e92b87b 100644 --- a/tests/units/array/test_generic_array.py +++ b/tests/units/array/test_generic_array.py @@ -1,4 +1,4 @@ -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.base_document import AnyDoc @@ -6,14 +6,14 @@ def test_generic_init(): class Text(BaseDoc): text: str - da = DocumentArray[Text]([]) + da = DocArray[Text]([]) da.document_type == Text - assert isinstance(da, DocumentArray) + assert isinstance(da, DocArray) def test_normal_access_init(): - da = DocumentArray([]) + da = DocArray([]) da.document_type == AnyDoc - assert isinstance(da, DocumentArray) + assert isinstance(da, DocArray) diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py index 9d875b1b6bf..6aa9e363301 100644 --- a/tests/units/array/test_indexing.py +++ b/tests/units/array/test_indexing.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import DocumentArray, DocumentArrayStacked +from docarray import DocArray, DocArrayStacked from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -11,7 +11,7 @@ def da(): texts = [f'hello {i}' for i in range(10)] tensors = [torch.ones((4,)) * i for i in range(10)] - return DocumentArray[TextDoc]( + return DocArray[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], ) @@ -20,7 +20,7 @@ def da(): def da_to_set(): texts = [f'hello {2*i}' for i in range(5)] tensors = [torch.ones((4,)) * i * 2 for i in range(5)] - return DocumentArray[TextDoc]( + return DocArray[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], ) @@ -236,7 +236,7 @@ def test_boolmask_setitem(stack_left, 
stack_right, da, da_to_set, index): def test_setitem_update_column(): texts = [f'hello {i}' for i in range(10)] tensors = [torch.ones((4,)) * (i + 1) for i in range(10)] - da = DocumentArrayStacked[TextDoc]( + da = DocArrayStacked[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], tensor_type=TorchTensor, ) diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py index 700ecedd3a4..b6bd25f0be8 100644 --- a/tests/units/array/test_traverse.py +++ b/tests/units/array/test_traverse.py @@ -3,7 +3,7 @@ import pytest import torch -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.array.abstract_array import AnyDocArray from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -21,21 +21,21 @@ class SubSubDoc(BaseDoc): class SubDoc(BaseDoc): sub_text: TextDoc - sub_da: DocumentArray[SubSubDoc] + sub_da: DocArray[SubSubDoc] class MultiModalDoc(BaseDoc): mm_text: TextDoc mm_tensor: Optional[TorchTensor[3, 2, 2]] - mm_da: DocumentArray[SubDoc] + mm_da: DocArray[SubDoc] - docs = DocumentArray[MultiModalDoc]( + docs = DocArray[MultiModalDoc]( [ MultiModalDoc( mm_text=TextDoc(text=f'hello{i}'), mm_da=[ SubDoc( sub_text=TextDoc(text=f'sub_{i}_1'), - sub_da=DocumentArray[SubSubDoc]( + sub_da=DocArray[SubSubDoc]( [ SubSubDoc( sub_sub_text=TextDoc(text='subsub'), @@ -81,7 +81,7 @@ def test_traverse_stacked_da(): class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocumentArray[Image]( + batch = DocArray[Image]( [ Image( tensor=torch.zeros(3, 224, 224), @@ -112,7 +112,7 @@ def test_flatten_one_level(input_list, output_list): def test_flatten_one_level_list_of_da(): doc = BaseDoc() - input_list = [DocumentArray([doc, doc, doc])] + input_list = [DocArray([doc, doc, doc])] flattened = AnyDocArray._flatten_one_level(sequence=input_list) assert flattened == [doc, doc, doc] diff --git a/tests/units/document/proto/test_document_proto.py 
b/tests/units/document/proto/test_document_proto.py index bd132966c38..9b4c73e57cd 100644 --- a/tests/units/document/proto/test_document_proto.py +++ b/tests/units/document/proto/test_document_proto.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import DocumentArray +from docarray import DocArray from docarray.base_document import BaseDoc from docarray.typing import NdArray, TorchTensor from docarray.utils.misc import is_tf_available @@ -57,11 +57,11 @@ class CustomInnerDoc(BaseDoc): class CustomDoc(BaseDoc): text: str - chunks: DocumentArray[CustomInnerDoc] + chunks: DocArray[CustomInnerDoc] doc = CustomDoc( text='hello', - chunks=DocumentArray[CustomInnerDoc]( + chunks=DocArray[CustomInnerDoc]( [CustomInnerDoc(tensor=np.zeros((3, 224, 224))) for _ in range(5)], ), ) @@ -95,11 +95,11 @@ class CustomInnerDoc(BaseDoc): class CustomDoc(BaseDoc): text: str - chunks: DocumentArray[CustomInnerDoc] + chunks: DocArray[CustomInnerDoc] doc = CustomDoc( text='hello', - chunks=DocumentArray[CustomInnerDoc]( + chunks=DocArray[CustomInnerDoc]( [CustomInnerDoc(tensor=torch.zeros((3, 224, 224))) for _ in range(5)], ), ) diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py index 0ed1745f3fb..690b83649ed 100644 --- a/tests/units/document/test_update.py +++ b/tests/units/document/test_update.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc @@ -16,8 +16,8 @@ class MMDoc(BaseDoc): price: int = 0 categories: Optional[List[str]] = None image: Optional[ImageDoc] = None - matches: Optional[DocumentArray] = None - matches_with_same_id: Optional[DocumentArray] = None + matches: Optional[DocArray] = None + matches_with_same_id: Optional[DocArray] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None @@ -30,9 +30,9 @@ def doc1(): text='hey here', categories=['a', 'b', 'c'], price=10, - 
matches=DocumentArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocumentArray[MMDoc]( - [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + matches=DocArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocArray[MMDoc]( + [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), @@ -48,9 +48,9 @@ def doc2(doc1): categories=['d', 'e', 'f'], price=5, opt_int=5, - matches=DocumentArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocumentArray[MMDoc]( - [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + matches=DocArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocArray[MMDoc]( + [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), diff --git a/tests/units/document/test_view.py b/tests/units/document/test_view.py index 1fdbe2f5a9f..a544289f7ec 100644 --- a/tests/units/document/test_view.py +++ b/tests/units/document/test_view.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array import DocumentArrayStacked +from docarray.array import DocArrayStacked from docarray.array.stacked.column_storage import ColumnStorageView from docarray.typing import AnyTensor @@ -13,7 +13,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=i) for i in range(4)] - storage = DocumentArrayStacked[MyDoc](docs)._storage + storage = DocArrayStacked[MyDoc](docs)._storage doc = MyDoc.from_view(ColumnStorageView(0, storage)) assert doc.is_view() diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py index 9dd300a9dec..652400d2905 100644 --- a/tests/units/test_helper.py +++ b/tests/units/test_helper.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from docarray.helper import ( _access_path_dict_to_nested_dict, @@ -26,12 +26,12 @@ class Middle(BaseDoc): class 
Outer(BaseDoc): img: Optional[ImageDoc] middle: Optional[Middle] - da: DocumentArray[Inner] + da: DocArray[Inner] doc = Outer( img=ImageDoc(), middle=Middle(img=ImageDoc(), inner=Inner(img=ImageDoc())), - da=DocumentArray[Inner]([Inner(img=ImageDoc(url='test.png'))]), + da=DocArray[Inner]([Inner(img=ImageDoc(url='test.png'))]), ) return doc diff --git a/tests/units/typing/da/test_relations.py b/tests/units/typing/da/test_relations.py index dad579ad81d..fcdf1177657 100644 --- a/tests/units/typing/da/test_relations.py +++ b/tests/units/typing/da/test_relations.py @@ -1,33 +1,33 @@ -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray def test_instance_and_equivalence(): class MyDoc(BaseDoc): text: str - docs = DocumentArray[MyDoc]([MyDoc(text='hello')]) + docs = DocArray[MyDoc]([MyDoc(text='hello')]) - assert issubclass(DocumentArray[MyDoc], DocumentArray[MyDoc]) - assert issubclass(docs.__class__, DocumentArray[MyDoc]) + assert issubclass(DocArray[MyDoc], DocArray[MyDoc]) + assert issubclass(docs.__class__, DocArray[MyDoc]) - assert isinstance(docs, DocumentArray[MyDoc]) + assert isinstance(docs, DocArray[MyDoc]) def test_subclassing(): class MyDoc(BaseDoc): text: str - class MyDocArray(DocumentArray[MyDoc]): + class MyDocArray(DocArray[MyDoc]): pass docs = MyDocArray([MyDoc(text='hello')]) - assert issubclass(MyDocArray, DocumentArray[MyDoc]) - assert issubclass(docs.__class__, DocumentArray[MyDoc]) + assert issubclass(MyDocArray, DocArray[MyDoc]) + assert issubclass(docs.__class__, DocArray[MyDoc]) assert isinstance(docs, MyDocArray) - assert isinstance(docs, DocumentArray[MyDoc]) + assert isinstance(docs, DocArray[MyDoc]) assert issubclass(MyDoc, BaseDoc) - assert not issubclass(DocumentArray[MyDoc], DocumentArray[BaseDoc]) - assert not issubclass(MyDocArray, DocumentArray[BaseDoc]) + assert not issubclass(DocArray[MyDoc], DocArray[BaseDoc]) + assert not issubclass(MyDocArray, DocArray[BaseDoc]) diff --git 
a/tests/units/util/test_filter.py b/tests/units/util/test_filter.py index c9602a32c83..14e43290e9a 100644 --- a/tests/units/util/test_filter.py +++ b/tests/units/util/test_filter.py @@ -3,7 +3,7 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc, TextDoc from docarray.utils.filter import filter_docs @@ -45,7 +45,7 @@ def docs(): optional_num=30, dictionary={'a': 0, 'b': 1}, ) - docs = DocumentArray[MMDoc]([mmdoc1, mmdoc2, mmdoc3]) + docs = DocArray[MMDoc]([mmdoc1, mmdoc2, mmdoc3]) return docs @@ -173,7 +173,7 @@ def test_array_simple_filters(docs, dict_api): @pytest.mark.parametrize('dict_api', [True, False]) def test_placehold_filter(dict_api): - docs = DocumentArray[MMDoc]( + docs = DocArray[MMDoc]( [ MMDoc(text='A', text_doc=TextDoc(text='A')), MMDoc(text='A', text_doc=TextDoc(text='B')), @@ -251,7 +251,7 @@ class MyDocument(BaseDoc): image: ImageDoc price: int - docs = DocumentArray[MyDocument]( + docs = DocArray[MyDocument]( [ MyDocument( caption='A tiger in the jungle', diff --git a/tests/units/util/test_find.py b/tests/units/util/test_find.py index 342695c072e..9239e6d8dff 100644 --- a/tests/units/util/test_find.py +++ b/tests/units/util/test_find.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.typing import NdArray, TorchTensor from docarray.utils.find import find, find_batched @@ -24,7 +24,7 @@ def random_torch_query(): @pytest.fixture() def random_torch_batch_query(): - return DocumentArray[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) + return DocArray[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) @pytest.fixture() @@ -34,17 +34,17 @@ def random_nd_query(): @pytest.fixture() def random_nd_batch_query(): - return DocumentArray[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) + return 
DocArray[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) @pytest.fixture() def random_torch_index(): - return DocumentArray[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) + return DocArray[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) @pytest.fixture() def random_nd_index(): - return DocumentArray[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) + return DocArray[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) @pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) @@ -261,7 +261,7 @@ class MyDoc(BaseDoc): embedding: Optional[TorchTensor] query = MyDoc(embedding=torch.rand(10)) - index = DocumentArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + index = DocArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) top_k, scores = find( index, @@ -279,7 +279,7 @@ class MyDoc(BaseDoc): embedding: Union[TorchTensor, NdArray] query = MyDoc(embedding=torch.rand(10)) - index = DocumentArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + index = DocArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) top_k, scores = find( index, @@ -302,7 +302,7 @@ class MyDoc(BaseDoc): inner: InnerDoc query = MyDoc(inner=InnerDoc(title='query', embedding=torch.rand(2))) - index = DocumentArray[MyDoc]( + index = DocArray[MyDoc]( [ MyDoc(inner=InnerDoc(title=f'doc {i}', embedding=torch.rand(2))) for i in range(10) @@ -335,7 +335,7 @@ class MyDoc(BaseDoc): embedding3=torch.rand(10), embedding4=torch.rand(10), ) - index = DocumentArray[MyDoc]( + index = DocArray[MyDoc]( [ MyDoc( embedding=torch.rand(10), diff --git a/tests/units/util/test_map.py b/tests/units/util/test_map.py index 65227998d73..0e54aaa2732 100644 --- a/tests/units/util/test_map.py +++ b/tests/units/util/test_map.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc 
from docarray.typing import ImageUrl, NdArray from docarray.utils.map import map_docs, map_docs_batch @@ -19,9 +19,7 @@ def load_from_doc(d: ImageDoc) -> ImageDoc: @pytest.fixture() def da(): - da = DocumentArray[ImageDoc]( - [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)] - ) + da = DocArray[ImageDoc]([ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)]) return da @@ -52,7 +50,7 @@ def local_func(x): @pytest.mark.parametrize('backend', ['thread', 'process']) def test_check_order(backend): - da = DocumentArray[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) + da = DocArray[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) docs = list(map_docs(da=da, func=load_from_doc, backend=backend)) @@ -61,7 +59,7 @@ def test_check_order(backend): assert doc.id == str(i) -def load_from_da(da: DocumentArray) -> DocumentArray: +def load_from_da(da: DocArray) -> DocArray: for doc in da: doc.tensor = doc.url.load() return da @@ -77,13 +75,11 @@ class MyImage(BaseDoc): @pytest.mark.parametrize('backend', ['thread', 'process']) def test_map_docs_batch(n_docs, batch_size, backend): - da = DocumentArray[MyImage]( - [MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)] - ) + da = DocArray[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) it = map_docs_batch( da=da, func=load_from_da, batch_size=batch_size, backend=backend ) assert isinstance(it, Generator) for batch in it: - assert isinstance(batch, DocumentArray[MyImage]) + assert isinstance(batch, DocArray[MyImage]) diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index 362381b5580..e72e8863a46 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocumentArray +from docarray import BaseDoc, DocArray from docarray.documents import ImageDoc from docarray.utils.reduce import reduce, reduce_all @@ -17,8 +17,8 @@ class MMDoc(BaseDoc): price: int = 0 categories: Optional[List[str]] = 
None image: Optional[ImageDoc] = None - matches: Optional[DocumentArray] = None - matches_with_same_id: Optional[DocumentArray] = None + matches: Optional[DocArray] = None + matches_with_same_id: Optional[DocArray] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None @@ -31,9 +31,9 @@ def doc1(): text='hey here', categories=['a', 'b', 'c'], price=10, - matches=DocumentArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocumentArray[MMDoc]( - [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + matches=DocArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocArray[MMDoc]( + [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), @@ -49,9 +49,9 @@ def doc2(doc1): categories=['d', 'e', 'f'], price=5, opt_int=5, - matches=DocumentArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocumentArray[MMDoc]( - [MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))] + matches=DocArray[MMDoc]([MMDoc()]), + matches_with_same_id=DocArray[MMDoc]( + [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), @@ -60,8 +60,8 @@ def doc2(doc1): def test_reduce_different_ids(): - da1 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)]) - da2 = DocumentArray[MMDoc]([MMDoc() for _ in range(10)]) + da1 = DocArray[MMDoc]([MMDoc() for _ in range(10)]) + da2 = DocArray[MMDoc]([MMDoc() for _ in range(10)]) result = reduce(da1, da2) assert len(result) == 20 # da1 is changed in place (no extra memory) @@ -69,8 +69,8 @@ def test_reduce_different_ids(): def test_reduce(doc1, doc2): - da1 = DocumentArray[MMDoc]([doc1, MMDoc()]) - da2 = DocumentArray[MMDoc]([MMDoc(), doc2]) + da1 = DocArray[MMDoc]([doc1, MMDoc()]) + da2 = DocArray[MMDoc]([MMDoc(), doc2]) result = reduce(da1, da2) assert len(result) == 3 # da1 is changed in place (no extra memory) @@ -89,9 +89,9 @@ def test_reduce(doc1, doc2): def 
test_reduce_all(doc1, doc2): - da1 = DocumentArray[MMDoc]([doc1, MMDoc()]) - da2 = DocumentArray[MMDoc]([MMDoc(), doc2]) - da3 = DocumentArray[MMDoc]([MMDoc(), MMDoc(), doc1]) + da1 = DocArray[MMDoc]([doc1, MMDoc()]) + da2 = DocArray[MMDoc]([MMDoc(), doc2]) + da3 = DocArray[MMDoc]([MMDoc(), MMDoc(), doc1]) result = reduce_all([da1, da2, da3]) assert len(result) == 5 # da1 is changed in place (no extra memory) From 0f545c8ae0b7d0410a0e523ff93d99c5fead1112 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:31:13 +0200 Subject: [PATCH 03/22] refactor: rename base doc in md files Signed-off-by: samsja --- .github/workflows/ci.yml | 2 +- README.md | 42 +++++++++---------- .../base_document/base_document.md | 4 +- docs/tutorials/add_doc_index.md | 38 ++++++++--------- .../multimodal_training_and_serving.md | 10 ++--- ...optimize_performance_with_id_generation.md | 12 +++--- 6 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51759432772..9715db93d72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: poetry install --without dev poetry run pip install tensorflow==2.11.0 - name: Test basic import - run: poetry run python -c 'from docarray import DocArray, BaseDocument' + run: poetry run python -c 'from docarray import DocArray, BaseDoc' check-mypy: diff --git a/README.md b/README.md index c62e8f20a90..c8a21f420e7 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,12 @@ This means that DocArray lets you do the following things: ## Represent ```python -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl from typing import Optional -class MyDocument(BaseDocument): +class MyDocument(BaseDoc): description: str image_url: ImageUrl image_tensor: Optional[TorchTensor[1704, 2272, 3]] @@ -62,12 +62,12 @@ doc.embedding = clip_image_encoder( ### Compose nested Documents: ```python -from docarray 
import BaseDocument +from docarray import BaseDoc from docarray.documents import ImageDoc, TextDoc import numpy as np -class MultiModalDocument(BaseDocument): +class MultiModalDocument(BaseDoc): image_doc: ImageDoc text_doc: TextDoc @@ -79,12 +79,12 @@ doc = MultiModalDocument( ### Collect multiple `Documents` into a `DocArray`: ```python -from docarray import DocArray, BaseDocument +from docarray import DocArray, BaseDoc from docarray.typing import AnyTensor, ImageUrl import numpy as np -class Image(BaseDocument): +class Image(BaseDoc): url: ImageUrl tensor: AnyTensor ``` @@ -233,20 +233,20 @@ Not very easy on the eyes if you ask us. And even worse, if you need to add one So, now let's see what the same code looks like with DocArray: ```python -from docarray import DocArray, BaseDocument +from docarray import DocArray, BaseDoc from docarray.documents import ImageDoc, TextDoc, AudioDoc from docarray.typing import TorchTensor import torch -class Podcast(BaseDocument): +class Podcast(BaseDoc): text: TextDoc image: ImageDoc audio: AudioDoc -class PairPodcast(BaseDocument): +class PairPodcast(BaseDoc): left: Podcast right: Podcast @@ -297,12 +297,12 @@ This would look like the following: ```python from typing import Optional -from docarray import DocArray, BaseDocument +from docarray import DocArray, BaseDoc import tensorflow as tf -class Podcast(BaseDocument): +class Podcast(BaseDoc): audio_tensor: Optional[AudioTensorFlowTensor] embedding: Optional[AudioTensorFlowTensor] @@ -328,17 +328,17 @@ import numpy as np from fastapi import FastAPI from httpx import AsyncClient -from docarray import BaseDocument +from docarray import BaseDoc from docarray.documents import ImageDoc from docarray.typing import NdArray from docarray.base_document import DocumentResponse -class InputDoc(BaseDocument): +class InputDoc(BaseDoc): img: ImageDoc -class OutputDoc(BaseDocument): +class OutputDoc(BaseDoc): embedding_clip: NdArray embedding_bert: NdArray @@ -368,12 +368,12 @@ The big 
advantage here is **first-class support for ML centric data**, such as { This includes handy features such as validating the shape of a tensor: ```python -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import TorchTensor import torch -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] @@ -382,7 +382,7 @@ doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping doc = MyDoc(tensor=torch.zeros(224)) # fails validation -class Image(BaseDocument): +class Image(BaseDoc): tensor: TorchTensor[3, 'x', 'x'] @@ -407,13 +407,13 @@ store it there, and thus make it searchable: ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocArray, BaseDocument +from docarray import DocArray, BaseDoc from docarray.stores import DocumentStore from docarray.documents import ImageDoc, TextDoc import numpy as np -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): image: ImageDoc text: TextDoc description: str @@ -449,7 +449,7 @@ You can see more logs by setting the log level to `DEBUG` or `INFO`: ```python from pydantic import Field -from docarray import BaseDocument +from docarray import BaseDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray import logging @@ -459,7 +459,7 @@ logging.getLogger('docarray').setLevel(logging.DEBUG) # define a simple document and create a document index -class SimpleDoc(BaseDocument): +class SimpleDoc(BaseDoc): vector: NdArray = Field(dim=10) diff --git a/docs/api_references/base_document/base_document.md b/docs/api_references/base_document/base_document.md index cdb6b53e6f0..68427b67742 100644 --- a/docs/api_references/base_document/base_document.md +++ b/docs/api_references/base_document/base_document.md @@ -1,3 +1,3 @@ -# BaseDocument +# BaseDoc -::: docarray.base_document.document.BaseDocument +::: docarray.base_document.document.BaseDoc diff --git a/docs/tutorials/add_doc_index.md 
b/docs/tutorials/add_doc_index.md index afc8476d87e..8fb03b9978b 100644 --- a/docs/tutorials/add_doc_index.md +++ b/docs/tutorials/add_doc_index.md @@ -9,7 +9,7 @@ This document shows how to add a new Document Index to DocArray. That process can be broken down into a number of basic steps: -1. Create a new class that inherits from `BaseDocumentIndex` +1. Create a new class that inherits from `BaseDocIndex` 2. Declare default configurations for your Document Index 3. Implement abstract methods for indexing, searching, and deleting 4. Implement a Query Builder for your Document Index @@ -27,14 +27,14 @@ This is _not_ how you should store Documents in your implementation! You can fin ## Create a new Document Index class -To get started, create a new class that inherits from `BaseDocumentIndex` and `typing.Generic`: +To get started, create a new class that inherits from `BaseDocIndex` and `typing.Generic`: ```python -TSchema = TypeVar('TSchema', bound=BaseDocument) +TSchema = TypeVar('TSchema', bound=BaseDoc) -class MyDocumentIndex(BaseDocumentIndex, Generic[TSchema]): +class MyDocumentIndex(BaseDocIndex, Generic[TSchema]): ... 
``` @@ -84,11 +84,11 @@ To help you with all of this, `super().__init__` inject a few helpful attributes When a user instantiates a Document Index, they do so in a parametric way, like so: ```python -class Inner(BaseDocument): +class Inner(BaseDoc): embedding: NdArray[512] -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): tensor: NdArray[100] other_tensor: NdArray = Field(dim=10, space='cosine') description: str @@ -167,7 +167,7 @@ This leads to four possible scenarios: Imagine the user defines a schema like the following: ```python -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): tensor: NdArray[100] @@ -182,7 +182,7 @@ The `tensor` column in your backend should be configured to have dimensionality Imagine the user defines a schema like the following: ```python -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): tensor: NdArray = Field(dim=50) @@ -197,7 +197,7 @@ The `tensor` column in your backend should be configured to have dimensionality Imagine the user defines a schema like the following: ```python -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): tensor: NdArray[100] = Field(dim=50) @@ -212,7 +212,7 @@ The `tensor` column in your backend should be configured to have dimensionality Imagine the user defines a schema like the following: ```python -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): tensor: NdArray @@ -231,17 +231,17 @@ In order to define what can be stored in them, and what the default values are, ```python @dataclass -class DBConfig(BaseDocumentIndex.DBConfig): +class DBConfig(BaseDocIndex.DBConfig): ... @dataclass -class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): +class RuntimeConfig(BaseDocIndex.RuntimeConfig): default_column_config: Dict[Type, Dict[str, Any]] = ... 
``` Note that: -- `DBConfig` inherits from `BaseDocumentIndex.DBConfig` and `RuntimeConfig` inherits from `BaseDocumentIndex.RuntimeConfig` +- `DBConfig` inherits from `BaseDocIndex.DBConfig` and `RuntimeConfig` inherits from `BaseDocIndex.RuntimeConfig` - All fields in each dataclass need to have default values. Choose these sensibly, as they will be used if the user does not specify a value. ### The `DBConfig` class @@ -278,7 +278,7 @@ In general, the following is true: - For every method that you need to implement, there is a public variant (e.g. `index`) and a private variant (e.g. `_index`) - You should usually implement the private variant, which is called by the already implemented public variant. This should make your life easier, because some preprocessing and data normalization will already be done for you. - You can, however, also implement the public variant directly, if you want to do something special. - - **Caution**: While this is a perfectly fine thing to do, it might create more maintenance work for you in the future, because the public variant defined in the `BaseDocumentIndex` might change in the future, and you will have to update your implementation accordingly. + - **Caution**: While this is a perfectly fine thing to do, it might create more maintenance work for you in the future, because the public variant defined in the `BaseDocIndex` might change in the future, and you will have to update your implementation accordingly. Further: - You don't absolutely have to implement everything. If a feature (e.g. `text_search`) is not supported by your backend, just raise a `NotImplementedError` in the corresponding method. @@ -289,7 +289,7 @@ Further: These can then be used to control DB specific behaviours, such as consistency levels, batch sizes, etc. As mentioned above, it is good practice to mirror these arguments in `self.RuntimeConfig`. 
Overall, you're asked to implement the methods that appear after the `Abstract methods; Subclasses must implement these` -comment in the `BaseDocumentIndex` class. +comment in the `BaseDocIndex` class. The details of each method should become clear from the docstrings and type hints. ### The `python_type_to_db_type()` method @@ -297,12 +297,12 @@ The details of each method should become clear from the docstrings and type hint This method is slightly special, because 1) it is not exposed to the user, and 2) you absolutely have to implement it. It is intended to do the following: It takes a type of a field in the store's schema (e.g. `NdArray` for `tensor`), and returns the corresponding type in the database (e.g. `np.ndarray`). -The `BaseDocumentIndex` class uses this information to create and populate the `_ColumnInfo`s in `self._column_infos`. +The `BaseDocIndex` class uses this information to create and populate the `_ColumnInfo`s in `self._column_infos`. If the user wants to change the default behaviour, one can set the db type by using the `col_type` field: ```python -class MySchema(BaseDocument): +class MySchema(BaseDoc): my_num: float = Field(col_type='float64') my_text: str = Field(..., col_type='varchar', max_len=2048) ``` @@ -356,12 +356,12 @@ The QueryBuilder is what accumulates partial queries and builds them into a sing Your Query Builder has to be an inner class of your Document Index, its class name has to be `QueryBuilder`, and it has to inherit from the Base Query Builder: ```python -class QueryBuilder(BaseDocumentIndex.QueryBuilder): +class QueryBuilder(BaseDocIndex.QueryBuilder): ... ``` The Query Builder exposes the following interface: -- The same query related methods as the `BaseDocumentIndex` class (e.g. `filter`, `find`, `text_search`, and their batched variants) +- The same query related methods as the `BaseDocIndex` class (e.g. 
`filter`, `find`, `text_search`, and their batched variants) - The `build()` method The goal of it is to enable an interface for composing coplex queries, like this: diff --git a/docs/tutorials/multimodal_training_and_serving.md b/docs/tutorials/multimodal_training_and_serving.md index defe5ad27cf..0b8645a46fb 100644 --- a/docs/tutorials/multimodal_training_and_serving.md +++ b/docs/tutorials/multimodal_training_and_serving.md @@ -78,12 +78,12 @@ The first thing we are trying to achieve when using DocArray is to clearly model about which tensors are supposed to represent what. To do that we are using a concept that is at the core of DocArray. The `Document`, a collection of multi-modal data. -The `BaseDocument` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data. +The `BaseDoc` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data. Let's start by defining a few Documents to handle the different modalities that we will use during our training: ```python -from docarray import BaseDocument, DocArray +from docarray import BaseDoc, DocArray from docarray.typing import TorchTensor, ImageUrl ``` @@ -93,7 +93,7 @@ Let's first create a Document for our Text modality. 
It will contain a number of from docarray.documents import TextDoc as BaseText -class Tokens(BaseDocument): +class Tokens(BaseDoc): input_ids: TorchTensor[48] attention_mask: TorchTensor ``` @@ -116,7 +116,7 @@ Under the hood, an `Image` looks something like this (with the only main differe supported ML framework): ```python -# class Image(BaseDocument): +# class Image(BaseDoc): # url: Optional[ImageUrl] # tensor: Optional[TorchTesor] # embedding: Optional[TorchTensor] @@ -128,7 +128,7 @@ Actually, the `BaseText` above also alredy includes `tensor`, `url` and `embeddi The final Document used for training here is the `PairTextImage`, which simply combines the Text and Image modalities: ```python -class PairTextImage(BaseDocument): +class PairTextImage(BaseDoc): text: Text image: Image ``` diff --git a/docs/tutorials/optimize_performance_with_id_generation.md b/docs/tutorials/optimize_performance_with_id_generation.md index 72518a6ec61..db46020faa2 100644 --- a/docs/tutorials/optimize_performance_with_id_generation.md +++ b/docs/tutorials/optimize_performance_with_id_generation.md @@ -1,24 +1,24 @@ # How to optimize performance -### `BaseDocument`'s id +### `BaseDoc`'s id -DocArray's `BaseDocument` has an optional `id` field, which defaults to `ID(os.urandom(16).hex())`. This takes quite some time. +DocArray's `BaseDoc` has an optional `id` field, which defaults to `ID(os.urandom(16).hex())`. This takes quite some time. If you don't rely on the id anywhere, you can instead set the default to None. This increases the performance by a factor of approximately 1.4. ```python -from docarray import BaseDocument +from docarray import BaseDoc from docarray.typing import ID -class MyDoc(BaseDocument): +class MyDoc(BaseDoc): id: ID = None title: str ``` -Since the `BaseDocument.id` is optional, you could also set the value to None, but this turns out to be a bit less efficient than the option above, and increases the performance by a factor of approximately 1.2. 
+Since the `BaseDoc.id` is optional, you could also set the value to None, but this turns out to be a bit less efficient than the option above, and increases the performance by a factor of approximately 1.2. ```python -class MyDoc2(BaseDocument): +class MyDoc2(BaseDoc): title: str From 88e01eedb4b4fe1afb9e26c9102ef74de076f4fd Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:40:13 +0200 Subject: [PATCH 04/22] refactor: rename base document to base doc Signed-off-by: samsja --- README.md | 2 +- docarray/__init__.py | 2 +- docarray/array/abstract_array.py | 2 +- docarray/array/array/array.py | 2 +- docarray/array/array/io.py | 2 +- docarray/array/array/pushpull.py | 4 ++-- docarray/array/stacked/array_stacked.py | 4 ++-- docarray/base_doc/__init__.py | 6 ++++++ docarray/{base_document => base_doc}/any_doc.py | 0 docarray/{base_document => base_doc}/base_node.py | 0 docarray/{base_document => base_doc}/doc.py | 6 +++--- docarray/{base_document => base_doc}/doc_response.py | 2 +- docarray/{base_document => base_doc}/io/__init__.py | 0 docarray/{base_document => base_doc}/io/json.py | 0 docarray/base_doc/mixins/__init__.py | 4 ++++ docarray/{base_document => base_doc}/mixins/io.py | 2 +- docarray/{base_document => base_doc}/mixins/update.py | 0 docarray/base_document/__init__.py | 6 ------ docarray/base_document/mixins/__init__.py | 4 ---- docarray/display/document_summary.py | 2 +- docarray/documents/audio.py | 2 +- docarray/documents/image.py | 2 +- docarray/documents/mesh/mesh_3d.py | 2 +- docarray/documents/mesh/vertices_and_faces.py | 2 +- docarray/documents/point_cloud/point_cloud_3d.py | 2 +- docarray/documents/point_cloud/points_and_colors.py | 2 +- docarray/documents/text.py | 2 +- docarray/documents/video.py | 2 +- docarray/typing/abstract_type.py | 2 +- docarray/typing/tensor/ndarray.py | 2 +- docarray/typing/tensor/tensorflow_tensor.py | 2 +- docarray/typing/tensor/torch_tensor.py | 2 +- docarray/utils/find.py | 2 +-
docs/tutorials/multimodal_training_and_serving.md | 2 +- tests/integrations/document/test_to_json.py | 4 ++-- tests/integrations/externals/test_fastapi.py | 2 +- tests/integrations/predefined_document/test_mesh.py | 2 +- tests/integrations/typing/test_typing_proto.py | 2 +- tests/units/array/test_generic_array.py | 2 +- tests/units/document/proto/test_document_proto.py | 2 +- tests/units/document/test_any_document.py | 2 +- tests/units/document/test_base_document.py | 2 +- tests/units/typing/tensor/test_embedding.py | 2 +- tests/units/typing/tensor/test_tensor.py | 2 +- tests/units/typing/tensor/test_tensor_flow_tensor.py | 2 +- tests/units/typing/tensor/test_torch_tensor.py | 2 +- tests/units/typing/test_id.py | 2 +- tests/units/typing/url/test_any_url.py | 2 +- tests/units/typing/url/test_audio_url.py | 2 +- tests/units/typing/url/test_image_url.py | 2 +- tests/units/typing/url/test_mesh_url.py | 2 +- tests/units/typing/url/test_point_cloud_url.py | 2 +- tests/units/typing/url/test_text_url.py | 3 +-- tests/units/typing/url/test_video_url.py | 2 +- 54 files changed, 60 insertions(+), 61 deletions(-) create mode 100644 docarray/base_doc/__init__.py rename docarray/{base_document => base_doc}/any_doc.py (100%) rename docarray/{base_document => base_doc}/base_node.py (100%) rename docarray/{base_document => base_doc}/doc.py (93%) rename docarray/{base_document => base_doc}/doc_response.py (94%) rename docarray/{base_document => base_doc}/io/__init__.py (100%) rename docarray/{base_document => base_doc}/io/json.py (100%) create mode 100644 docarray/base_doc/mixins/__init__.py rename docarray/{base_document => base_doc}/mixins/io.py (99%) rename docarray/{base_document => base_doc}/mixins/update.py (100%) delete mode 100644 docarray/base_document/__init__.py delete mode 100644 docarray/base_document/mixins/__init__.py diff --git a/README.md b/README.md index c8a21f420e7..52826b5cd8b 100644 --- a/README.md +++ b/README.md @@ -331,7 +331,7 @@ from httpx import 
AsyncClient from docarray import BaseDoc from docarray.documents import ImageDoc from docarray.typing import NdArray -from docarray.base_document import DocumentResponse +from docarray.base_doc import DocumentResponse class InputDoc(BaseDoc): diff --git a/docarray/__init__.py b/docarray/__init__.py index f41f2f4af6a..d8b6bae90a5 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -3,7 +3,7 @@ import logging from docarray.array import DocArray, DocArrayStacked -from docarray.base_document.doc import BaseDoc +from docarray.base_doc.doc import BaseDoc __all__ = ['BaseDoc', 'DocArray', 'DocArrayStacked'] diff --git a/docarray/array/abstract_array.py b/docarray/array/abstract_array.py index ece2fd4270e..762f37cfb42 100644 --- a/docarray/array/abstract_array.py +++ b/docarray/array/abstract_array.py @@ -19,7 +19,7 @@ import numpy as np -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.display.document_array_summary import DocArraySummary from docarray.typing.abstract_type import AbstractType from docarray.utils._typing import change_cls_name diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index 2ea713493d9..1dac16e2eed 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -24,7 +24,7 @@ IndexingSequenceMixin, IndexIterType, ) -from docarray.base_document import AnyDoc, BaseDoc +from docarray.base_doc import AnyDoc, BaseDoc from docarray.typing import NdArray if TYPE_CHECKING: diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index 06042aab541..8659af08588 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -25,7 +25,7 @@ Union, ) -from docarray.base_document import AnyDoc, BaseDoc +from docarray.base_doc import AnyDoc, BaseDoc from docarray.helper import ( _access_path_dict_to_nested_dict, _all_access_paths_valid, diff --git a/docarray/array/array/pushpull.py b/docarray/array/array/pushpull.py index 
def3d144127..ee306620f4d 100644 --- a/docarray/array/array/pushpull.py +++ b/docarray/array/array/pushpull.py @@ -137,7 +137,7 @@ def pull( :param local_cache: store the downloaded DocArray to local folder :return: a :class:`DocArray` object """ - from docarray.base_document import AnyDoc + from docarray.base_doc import AnyDoc if cls.document_type == AnyDoc: raise TypeError( @@ -165,7 +165,7 @@ def pull_stream( :param local_cache: store the downloaded DocArray to local folder :return: Iterator of Documents """ - from docarray.base_document import AnyDoc + from docarray.base_doc import AnyDoc if cls.document_type == AnyDoc: raise TypeError( diff --git a/docarray/array/stacked/array_stacked.py b/docarray/array/stacked/array_stacked.py index ec64037224e..d2630a4b655 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/stacked/array_stacked.py @@ -22,8 +22,8 @@ from docarray.array.array.array import DocArray from docarray.array.stacked.column_storage import ColumnStorage, ColumnStorageView from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing -from docarray.base_document import BaseDoc -from docarray.base_document.mixins.io import _type_to_protobuf +from docarray.base_doc import BaseDoc +from docarray.base_doc.mixins.io import _type_to_protobuf from docarray.typing import NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils._typing import is_tensor_union diff --git a/docarray/base_doc/__init__.py b/docarray/base_doc/__init__.py new file mode 100644 index 00000000000..5fdeed1a807 --- /dev/null +++ b/docarray/base_doc/__init__.py @@ -0,0 +1,6 @@ +from docarray.base_doc.any_doc import AnyDoc +from docarray.base_doc.base_node import BaseNode +from docarray.base_doc.doc import BaseDoc +from docarray.base_doc.doc_response import DocResponse + +__all__ = ['AnyDoc', 'BaseDoc', 'BaseNode', 'DocResponse'] diff --git a/docarray/base_document/any_doc.py b/docarray/base_doc/any_doc.py similarity 
index 100% rename from docarray/base_document/any_doc.py rename to docarray/base_doc/any_doc.py diff --git a/docarray/base_document/base_node.py b/docarray/base_doc/base_node.py similarity index 100% rename from docarray/base_document/base_node.py rename to docarray/base_doc/base_node.py diff --git a/docarray/base_document/doc.py b/docarray/base_doc/doc.py similarity index 93% rename from docarray/base_document/doc.py rename to docarray/base_doc/doc.py index ab3881caf6e..c828a98f8b2 100644 --- a/docarray/base_document/doc.py +++ b/docarray/base_doc/doc.py @@ -5,9 +5,9 @@ from pydantic import BaseModel, Field from rich.console import Console -from docarray.base_document.base_node import BaseNode -from docarray.base_document.io.json import orjson_dumps, orjson_dumps_and_decode -from docarray.base_document.mixins import IOMixin, UpdateMixin +from docarray.base_doc.base_node import BaseNode +from docarray.base_doc.io.json import orjson_dumps, orjson_dumps_and_decode +from docarray.base_doc.mixins import IOMixin, UpdateMixin from docarray.typing import ID if TYPE_CHECKING: diff --git a/docarray/base_document/doc_response.py b/docarray/base_doc/doc_response.py similarity index 94% rename from docarray/base_document/doc_response.py rename to docarray/base_doc/doc_response.py index ee58adc8a8b..cbc43bf4767 100644 --- a/docarray/base_document/doc_response.py +++ b/docarray/base_doc/doc_response.py @@ -19,7 +19,7 @@ class DocResponse(JSONResponse): EXAMPLE USAGE .. 
code-block:: python from docarray.documets import Text - from docarray.base_document import DocResponse + from docarray.base_doc import DocResponse @app.post("/doc/", response_model=Text, response_class=DocResponse) diff --git a/docarray/base_document/io/__init__.py b/docarray/base_doc/io/__init__.py similarity index 100% rename from docarray/base_document/io/__init__.py rename to docarray/base_doc/io/__init__.py diff --git a/docarray/base_document/io/json.py b/docarray/base_doc/io/json.py similarity index 100% rename from docarray/base_document/io/json.py rename to docarray/base_doc/io/json.py diff --git a/docarray/base_doc/mixins/__init__.py b/docarray/base_doc/mixins/__init__.py new file mode 100644 index 00000000000..bfa675df9a1 --- /dev/null +++ b/docarray/base_doc/mixins/__init__.py @@ -0,0 +1,4 @@ +from docarray.base_doc.mixins.io import IOMixin +from docarray.base_doc.mixins.update import UpdateMixin + +__all__ = ['IOMixin', 'UpdateMixin'] diff --git a/docarray/base_document/mixins/io.py b/docarray/base_doc/mixins/io.py similarity index 99% rename from docarray/base_document/mixins/io.py rename to docarray/base_doc/mixins/io.py index a69b95ef3f1..13f723df3a3 100644 --- a/docarray/base_document/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -17,7 +17,7 @@ import numpy as np from typing_inspect import is_union_type -from docarray.base_document.base_node import BaseNode +from docarray.base_doc.base_node import BaseNode from docarray.typing import NdArray from docarray.typing.proto_register import _PROTO_TYPE_NAME_TO_CLASS from docarray.utils.compress import _compress_bytes, _decompress_bytes diff --git a/docarray/base_document/mixins/update.py b/docarray/base_doc/mixins/update.py similarity index 100% rename from docarray/base_document/mixins/update.py rename to docarray/base_doc/mixins/update.py diff --git a/docarray/base_document/__init__.py b/docarray/base_document/__init__.py deleted file mode 100644 index 5018394f7de..00000000000 --- 
a/docarray/base_document/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from docarray.base_document.any_doc import AnyDoc -from docarray.base_document.base_node import BaseNode -from docarray.base_document.doc import BaseDoc -from docarray.base_document.doc_response import DocResponse - -__all__ = ['AnyDoc', 'BaseDoc', 'BaseNode', 'DocResponse'] diff --git a/docarray/base_document/mixins/__init__.py b/docarray/base_document/mixins/__init__.py deleted file mode 100644 index 53b3242874a..00000000000 --- a/docarray/base_document/mixins/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from docarray.base_document.mixins.io import IOMixin -from docarray.base_document.mixins.update import UpdateMixin - -__all__ = ['IOMixin', 'UpdateMixin'] diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index f77dddd1e71..a7fe5009e9b 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -6,7 +6,7 @@ from typing_extensions import TYPE_CHECKING from typing_inspect import is_optional_type, is_union_type -from docarray.base_document.doc import BaseDoc +from docarray.base_doc.doc import BaseDoc from docarray.display.tensor_display import TensorDisplay from docarray.typing import ID from docarray.typing.tensor.abstract_tensor import AbstractTensor diff --git a/docarray/documents/audio.py b/docarray/documents/audio.py index 4db0a3dc899..ab0bbe00b34 100644 --- a/docarray/documents/audio.py +++ b/docarray/documents/audio.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.typing import AnyEmbedding, AudioUrl from docarray.typing.bytes.audio_bytes import AudioBytes from docarray.typing.tensor.abstract_tensor import AbstractTensor diff --git a/docarray/documents/image.py b/docarray/documents/image.py index 6a6c643c68b..f2375637de6 100644 --- a/docarray/documents/image.py +++ b/docarray/documents/image.py @@ -2,7 +2,7 @@ import numpy as np -from 
docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.typing import AnyEmbedding, ImageBytes, ImageUrl from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.image.image_tensor import ImageTensor diff --git a/docarray/documents/mesh/mesh_3d.py b/docarray/documents/mesh/mesh_3d.py index 10a45607486..a8a2edd5d15 100644 --- a/docarray/documents/mesh/mesh_3d.py +++ b/docarray/documents/mesh/mesh_3d.py @@ -1,6 +1,6 @@ from typing import Any, Optional, Type, TypeVar, Union -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces from docarray.typing.tensor.embedding import AnyEmbedding from docarray.typing.url.url_3d.mesh_url import Mesh3DUrl diff --git a/docarray/documents/mesh/vertices_and_faces.py b/docarray/documents/mesh/vertices_and_faces.py index d6909414a8e..364ad3187f6 100644 --- a/docarray/documents/mesh/vertices_and_faces.py +++ b/docarray/documents/mesh/vertices_and_faces.py @@ -1,6 +1,6 @@ from typing import Any, Type, TypeVar, Union -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.typing.tensor.tensor import AnyTensor T = TypeVar('T', bound='VerticesAndFaces') diff --git a/docarray/documents/point_cloud/point_cloud_3d.py b/docarray/documents/point_cloud/point_cloud_3d.py index 937e2a77f20..3cee613f596 100644 --- a/docarray/documents/point_cloud/point_cloud_3d.py +++ b/docarray/documents/point_cloud/point_cloud_3d.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.documents.point_cloud.points_and_colors import PointsAndColors from docarray.typing import AnyEmbedding, PointCloud3DUrl from docarray.typing.tensor.abstract_tensor import AbstractTensor diff --git a/docarray/documents/point_cloud/points_and_colors.py 
b/docarray/documents/point_cloud/points_and_colors.py index c64cd4f48bd..af4917bf5a7 100644 --- a/docarray/documents/point_cloud/points_and_colors.py +++ b/docarray/documents/point_cloud/points_and_colors.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.typing import AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils.misc import is_tf_available, is_torch_available diff --git a/docarray/documents/text.py b/docarray/documents/text.py index fb19397dd39..c0c2725891c 100644 --- a/docarray/documents/text.py +++ b/docarray/documents/text.py @@ -1,6 +1,6 @@ from typing import Any, Optional, Type, TypeVar, Union -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.typing import TextUrl from docarray.typing.tensor.embedding import AnyEmbedding diff --git a/docarray/documents/video.py b/docarray/documents/video.py index ceef6122561..3770ee245fc 100644 --- a/docarray/documents/video.py +++ b/docarray/documents/video.py @@ -2,7 +2,7 @@ import numpy as np -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.documents import AudioDoc from docarray.typing import AnyEmbedding, AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor diff --git a/docarray/typing/abstract_type.py b/docarray/typing/abstract_type.py index aee87da298e..fd73c93452e 100644 --- a/docarray/typing/abstract_type.py +++ b/docarray/typing/abstract_type.py @@ -4,7 +4,7 @@ from pydantic import BaseConfig from pydantic.fields import ModelField -from docarray.base_document.base_node import BaseNode +from docarray.base_doc.base_node import BaseNode if TYPE_CHECKING: from docarray.proto import NodeProto diff --git a/docarray/typing/tensor/ndarray.py b/docarray/typing/tensor/ndarray.py index 2ed7d649868..0fc41a0c184 100644 --- a/docarray/typing/tensor/ndarray.py +++ 
b/docarray/typing/tensor/ndarray.py @@ -23,7 +23,7 @@ from docarray.computation.numpy_backend import NumpyCompBackend from docarray.proto import NdArrayProto -from docarray.base_document.base_node import BaseNode +from docarray.base_doc.base_node import BaseNode T = TypeVar('T', bound='NdArray') ShapeT = TypeVar('ShapeT') diff --git a/docarray/typing/tensor/tensorflow_tensor.py b/docarray/typing/tensor/tensorflow_tensor.py index ec686b486ba..4f28ab78a47 100644 --- a/docarray/typing/tensor/tensorflow_tensor.py +++ b/docarray/typing/tensor/tensorflow_tensor.py @@ -12,7 +12,7 @@ from docarray.proto import NdArrayProto from docarray.computation.tensorflow_backend import TensorFlowCompBackend -from docarray.base_document.base_node import BaseNode +from docarray.base_doc.base_node import BaseNode T = TypeVar('T', bound='TensorFlowTensor') ShapeT = TypeVar('ShapeT') diff --git a/docarray/typing/tensor/torch_tensor.py b/docarray/typing/tensor/torch_tensor.py index a9c71d2e747..7ca9cf70a89 100644 --- a/docarray/typing/tensor/torch_tensor.py +++ b/docarray/typing/tensor/torch_tensor.py @@ -13,7 +13,7 @@ from docarray.proto import NdArrayProto from docarray.computation.torch_backend import TorchCompBackend -from docarray.base_document.base_node import BaseNode +from docarray.base_doc.base_node import BaseNode T = TypeVar('T', bound='TorchTensor') ShapeT = TypeVar('ShapeT') diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 98229eb7b7b..b7029578b56 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -5,7 +5,7 @@ from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocArray from docarray.array.stacked.array_stacked import DocArrayStacked -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.helper import _get_field_type_by_access_path from docarray.typing import AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor diff --git 
a/docs/tutorials/multimodal_training_and_serving.md b/docs/tutorials/multimodal_training_and_serving.md index 0b8645a46fb..fd4421beb0f 100644 --- a/docs/tutorials/multimodal_training_and_serving.md +++ b/docs/tutorials/multimodal_training_and_serving.md @@ -342,7 +342,7 @@ FastAPI will be able to automatically translate it into a fully fledged API with ```python from fastapi import FastAPI -from docarray.base_document import DocumentResponse +from docarray.base_doc import DocumentResponse ``` ```python diff --git a/tests/integrations/document/test_to_json.py b/tests/integrations/document/test_to_json.py index 9bc06be14da..8eef195e2b8 100644 --- a/tests/integrations/document/test_to_json.py +++ b/tests/integrations/document/test_to_json.py @@ -2,8 +2,8 @@ import pytest import torch -from docarray.base_document import BaseDoc -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc import BaseDoc +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import AnyUrl, NdArray, TorchTensor diff --git a/tests/integrations/externals/test_fastapi.py b/tests/integrations/externals/test_fastapi.py index 03bc4650775..5c5ed0bba60 100644 --- a/tests/integrations/externals/test_fastapi.py +++ b/tests/integrations/externals/test_fastapi.py @@ -4,7 +4,7 @@ from httpx import AsyncClient from docarray import BaseDoc -from docarray.base_document import DocResponse +from docarray.base_doc import DocResponse from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray diff --git a/tests/integrations/predefined_document/test_mesh.py b/tests/integrations/predefined_document/test_mesh.py index 7b6edecb11d..5f91ffd9067 100644 --- a/tests/integrations/predefined_document/test_mesh.py +++ b/tests/integrations/predefined_document/test_mesh.py @@ -2,7 +2,7 @@ import pytest from pydantic import parse_obj_as -from docarray.base_document.doc import BaseDoc +from docarray.base_doc.doc import BaseDoc from docarray.documents import 
Mesh3D from tests import TOYDATA_DIR diff --git a/tests/integrations/typing/test_typing_proto.py b/tests/integrations/typing/test_typing_proto.py index a6f3f571659..ff16c2bc1e0 100644 --- a/tests/integrations/typing/test_typing_proto.py +++ b/tests/integrations/typing/test_typing_proto.py @@ -3,7 +3,7 @@ import torch from docarray import BaseDoc -from docarray.base_document import AnyDoc +from docarray.base_doc import AnyDoc from docarray.typing import ( AnyEmbedding, AnyUrl, diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py index 66f9e92b87b..e0b5386e676 100644 --- a/tests/units/array/test_generic_array.py +++ b/tests/units/array/test_generic_array.py @@ -1,5 +1,5 @@ from docarray import BaseDoc, DocArray -from docarray.base_document import AnyDoc +from docarray.base_doc import AnyDoc def test_generic_init(): diff --git a/tests/units/document/proto/test_document_proto.py b/tests/units/document/proto/test_document_proto.py index 9b4c73e57cd..e6e8a58fa99 100644 --- a/tests/units/document/proto/test_document_proto.py +++ b/tests/units/document/proto/test_document_proto.py @@ -5,7 +5,7 @@ import torch from docarray import DocArray -from docarray.base_document import BaseDoc +from docarray.base_doc import BaseDoc from docarray.typing import NdArray, TorchTensor from docarray.utils.misc import is_tf_available diff --git a/tests/units/document/test_any_document.py b/tests/units/document/test_any_document.py index 5fae5d05b00..9628b013fd5 100644 --- a/tests/units/document/test_any_document.py +++ b/tests/units/document/test_any_document.py @@ -1,6 +1,6 @@ import numpy as np -from docarray.base_document import AnyDoc, BaseDoc +from docarray.base_doc import AnyDoc, BaseDoc from docarray.typing import NdArray diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py index 91d02e600f5..e986ff0f1bb 100644 --- a/tests/units/document/test_base_document.py +++ 
b/tests/units/document/test_base_document.py @@ -1,6 +1,6 @@ from typing import List, Optional -from docarray.base_document.doc import BaseDoc +from docarray.base_doc.doc import BaseDoc def test_base_document_init(): diff --git a/tests/units/typing/tensor/test_embedding.py b/tests/units/typing/tensor/test_embedding.py index f7eebcf4caf..88d9f83ecc1 100644 --- a/tests/units/typing/tensor/test_embedding.py +++ b/tests/units/typing/tensor/test_embedding.py @@ -2,7 +2,7 @@ import pytest from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import AnyEmbedding diff --git a/tests/units/typing/tensor/test_tensor.py b/tests/units/typing/tensor/test_tensor.py index a72ba18769d..787cb16d849 100644 --- a/tests/units/typing/tensor/test_tensor.py +++ b/tests/units/typing/tensor/test_tensor.py @@ -4,7 +4,7 @@ import torch from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import AudioNdArray, NdArray, TorchTensor from docarray.typing.tensor import NdArrayEmbedding diff --git a/tests/units/typing/tensor/test_tensor_flow_tensor.py b/tests/units/typing/tensor/test_tensor_flow_tensor.py index a40e8db5508..5c7d942a02d 100644 --- a/tests/units/typing/tensor/test_tensor_flow_tensor.py +++ b/tests/units/typing/tensor/test_tensor_flow_tensor.py @@ -3,7 +3,7 @@ from pydantic import schema_json_of from pydantic.tools import parse_obj_as -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.utils.misc import is_tf_available tf_available = is_tf_available() diff --git a/tests/units/typing/tensor/test_torch_tensor.py b/tests/units/typing/tensor/test_torch_tensor.py index 8b1d5f8250b..78b28b33c62 100644 --- a/tests/units/typing/tensor/test_torch_tensor.py +++ 
b/tests/units/typing/tensor/test_torch_tensor.py @@ -2,7 +2,7 @@ import torch from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import TorchEmbedding, TorchTensor diff --git a/tests/units/typing/test_id.py b/tests/units/typing/test_id.py index 39ca28bb29f..377a28d1935 100644 --- a/tests/units/typing/test_id.py +++ b/tests/units/typing/test_id.py @@ -4,7 +4,7 @@ from pydantic import schema_json_of from pydantic.tools import parse_obj_as -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import ID diff --git a/tests/units/typing/url/test_any_url.py b/tests/units/typing/url/test_any_url.py index d86c09f2442..f8b55a3fdac 100644 --- a/tests/units/typing/url/test_any_url.py +++ b/tests/units/typing/url/test_any_url.py @@ -1,7 +1,7 @@ import pytest from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import AnyUrl diff --git a/tests/units/typing/url/test_audio_url.py b/tests/units/typing/url/test_audio_url.py index 1f326effa7a..6b876d81de7 100644 --- a/tests/units/typing/url/test_audio_url.py +++ b/tests/units/typing/url/test_audio_url.py @@ -6,7 +6,7 @@ from pydantic.tools import parse_obj_as, schema_json_of from docarray import BaseDoc -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import AudioTorchTensor, AudioUrl from docarray.utils.misc import is_tf_available from tests import TOYDATA_DIR diff --git a/tests/units/typing/url/test_image_url.py b/tests/units/typing/url/test_image_url.py index cc95a074c3c..b425e498ab2 100644 --- a/tests/units/typing/url/test_image_url.py +++ b/tests/units/typing/url/test_image_url.py @@ -6,7 +6,7 @@ from PIL import 
Image from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import ImageUrl CUR_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/tests/units/typing/url/test_mesh_url.py b/tests/units/typing/url/test_mesh_url.py index 9893c90118b..fbc5342f2d6 100644 --- a/tests/units/typing/url/test_mesh_url.py +++ b/tests/units/typing/url/test_mesh_url.py @@ -2,7 +2,7 @@ import pytest from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import Mesh3DUrl, NdArray from tests import TOYDATA_DIR diff --git a/tests/units/typing/url/test_point_cloud_url.py b/tests/units/typing/url/test_point_cloud_url.py index 7f28cdf9f30..0b0f744a8a3 100644 --- a/tests/units/typing/url/test_point_cloud_url.py +++ b/tests/units/typing/url/test_point_cloud_url.py @@ -2,7 +2,7 @@ import pytest from pydantic.tools import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import NdArray, PointCloud3DUrl from tests import TOYDATA_DIR diff --git a/tests/units/typing/url/test_text_url.py b/tests/units/typing/url/test_text_url.py index 5c6d4a5b9e4..17498eac9b9 100644 --- a/tests/units/typing/url/test_text_url.py +++ b/tests/units/typing/url/test_text_url.py @@ -4,9 +4,8 @@ import pytest from pydantic import parse_obj_as, schema_json_of -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import TextUrl - from tests import TOYDATA_DIR REMOTE_TEXT_FILE = 'https://de.wikipedia.org/wiki/Brixen' diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py index c69bd1f3054..a1401bde896 100644 --- 
a/tests/units/typing/url/test_video_url.py +++ b/tests/units/typing/url/test_video_url.py @@ -6,7 +6,7 @@ from pydantic.tools import parse_obj_as, schema_json_of from docarray import BaseDoc -from docarray.base_document.io.json import orjson_dumps +from docarray.base_doc.io.json import orjson_dumps from docarray.typing import ( AudioNdArray, NdArray, From 65faa3460cc26af971ac44a8464143955a420b39 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:41:13 +0200 Subject: [PATCH 05/22] fix(docs): fix docs building Signed-off-by: samsja --- docs/api_references/base_doc/base_doc.md | 3 +++ docs/api_references/base_document/base_document.md | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 docs/api_references/base_doc/base_doc.md delete mode 100644 docs/api_references/base_document/base_document.md diff --git a/docs/api_references/base_doc/base_doc.md b/docs/api_references/base_doc/base_doc.md new file mode 100644 index 00000000000..0fe2dc80891 --- /dev/null +++ b/docs/api_references/base_doc/base_doc.md @@ -0,0 +1,3 @@ +# BaseDoc + +::: docarray.base_doc.doc.BaseDoc diff --git a/docs/api_references/base_document/base_document.md b/docs/api_references/base_document/base_document.md deleted file mode 100644 index 68427b67742..00000000000 --- a/docs/api_references/base_document/base_document.md +++ /dev/null @@ -1,3 +0,0 @@ -# BaseDoc - -::: docarray.base_document.document.BaseDoc From 1f26f01b4e50e6ea5db73d7d6462bef12adb4432 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 14:07:18 +0200 Subject: [PATCH 06/22] fix: ignore hubble test Signed-off-by: samsja --- tests/integrations/store/test_jac.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integrations/store/test_jac.py b/tests/integrations/store/test_jac.py index 94d3c693e32..db07e684419 100644 --- a/tests/integrations/store/test_jac.py +++ b/tests/integrations/store/test_jac.py @@ -13,6 +13,8 @@ TOLERANCE_RATIO = 0.5 # Percentage of difference allowed in
stream vs non-stream test RANDOM: str = uuid.uuid4().hex[:8] +pytestmark = pytest.mark.ignore + @pytest.fixture(scope='session', autouse=True) def testing_namespace_cleanup(): From 70fde35b4800d3b024df7824588d5591c8ec025a Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 14:15:22 +0200 Subject: [PATCH 07/22] fix: ignore hubble test Signed-off-by: samsja --- tests/integrations/store/test_jac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integrations/store/test_jac.py b/tests/integrations/store/test_jac.py index db07e684419..3e070b6de2b 100644 --- a/tests/integrations/store/test_jac.py +++ b/tests/integrations/store/test_jac.py @@ -13,7 +13,7 @@ TOLERANCE_RATIO = 0.5 # Percentage of difference allowed in stream vs non-stream test RANDOM: str = uuid.uuid4().hex[:8] -pytestmark = pytest.mark.ignore +pytestmark = [pytest.mark.skip] @pytest.fixture(scope='session', autouse=True) From dd6b0104961b9091a1ab243a1f845de9ec2607f4 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 11:00:49 +0200 Subject: [PATCH 08/22] docs: add userguide install Signed-off-by: samsja --- docs/user_guide/intro.md | 22 ++++++++++++++++++++++ pyproject.toml | 3 +++ 2 files changed, 25 insertions(+) create mode 100644 docs/user_guide/intro.md diff --git a/docs/user_guide/intro.md b/docs/user_guide/intro.md new file mode 100644 index 00000000000..3710be483f7 --- /dev/null +++ b/docs/user_guide/intro.md @@ -0,0 +1,22 @@ +# User Guide - Intro + +This user guide show you how to use `DocArray` with most of its features, step by step. + +## Install DocArray + +To install `DocArray` to follow this user guide, you can use the following command: + +```console +$ pip install "docarray[full]" + +---> 100% +``` + +Note: this will install the main dependencies of `DocArray` and will work will all the modalities supported. +T + +!!!
note + To install a very light version of `DocArray` with only the core dependencies, you can use the following command: + ``` + pip install "docarray[full]" + ``` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 61b10f7f298..3f47e4d9a13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,9 @@ pandas = ["pandas"] jac = ["jina-hubble-sdk"] aws = ["smart-open"] +# all +full = ["protobuf", "lz4", "pillow", "types-pillow", "av", "pydub", "trimesh"] + [tool.poetry.dev-dependencies] pytest = ">=7.0" pre-commit = ">=2.20.0" From 30df5172d07b4efc5bc81650773faa4c20216c7a Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:05:08 +0200 Subject: [PATCH 09/22] docs: add awesome-pages Signed-off-by: samsja --- docs/api_references/.pages | 4 +++ mkdocs.yml | 14 +++++++- poetry.lock | 67 ++++++++++++++++++++++++++++++++++++-- pyproject.toml | 2 +- 4 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 docs/api_references/.pages diff --git a/docs/api_references/.pages b/docs/api_references/.pages new file mode 100644 index 00000000000..2ac040fecf7 --- /dev/null +++ b/docs/api_references/.pages @@ -0,0 +1,4 @@ +nav: + - ... + +sort_type: natural \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 7ab61cfbe22..32e9519d564 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,9 +43,21 @@ markdown_extensions: plugins: - search + - awesome-pages - mkdocstrings: handlers: python: options: docstring_style: sphinx - inherited_members: true \ No newline at end of file + inherited_members: true + +nav: + - Home: README.md + - User Guide: + - user_guide/intro.md + - Tutorials: + - tutorials/add_doc_index.md + - tutorials/multimodal_training_and_serving.md + - tutorials/optimize_performance_with_id_generation.md + - ... 
+ - Contributing: CONTRIBUTING.md diff --git a/poetry.lock b/poetry.lock index 6b31ed3d618..ff4dad6815b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "aiohttp" @@ -470,6 +470,18 @@ urllib3 = ">=1.25.4,<1.27" [package.extras] crt = ["awscrt (==0.16.9)"] +[[package]] +name = "bracex" +version = "2.3.post1" +description = "Bash style brace expander." +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "bracex-2.3.post1-py3-none-any.whl", hash = "sha256:351b7f20d56fb9ea91f9b9e9e7664db466eb234188c175fd943f8f755c807e73"}, + {file = "bracex-2.3.post1.tar.gz", hash = "sha256:e7b23fc8b2cd06d3dec0692baabecb249dda94e06a617901ff03a6c56fd71693"}, +] + [[package]] name = "cached-property" version = "1.5.2" @@ -1658,6 +1670,23 @@ files = [ Markdown = ">=3.3" mkdocs = ">=1.1" +[[package]] +name = "mkdocs-awesome-pages-plugin" +version = "2.8.0" +description = "An MkDocs plugin that simplifies configuring page titles and their order" +category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "mkdocs-awesome-pages-plugin-2.8.0.tar.gz", hash = "sha256:af7e327e14b2eea3b2735c37428e33a528ecd2d9ae2296dc0f1632f0f3bc28f7"}, + {file = "mkdocs_awesome_pages_plugin-2.8.0-py3-none-any.whl", hash = "sha256:6b21ad4f41aecbe89e3a9a51f8837892cc7ce8ca0f9f4e0a355d56159ace3d68"}, +] + +[package.dependencies] +mkdocs = ">=1" +natsort = ">=8.1.0" +wcmatch = ">=7" + [[package]] name = "mkdocs-material" version = "9.1.3" @@ -1879,6 +1908,22 @@ files = [ {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, ] +[[package]] +name = "natsort" +version = "8.3.1" +description = "Simple yet flexible natural sorting in Python." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "natsort-8.3.1-py3-none-any.whl", hash = "sha256:d583bc9050dd10538de36297c960b93f873f0cd01671a3c50df5bd86dd391dcb"}, + {file = "natsort-8.3.1.tar.gz", hash = "sha256:517595492dde570a4fd6b6a76f644440c1ba51e2338c8a671d7f0475fda8f9fd"}, +] + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + [[package]] name = "nbclassic" version = "0.4.8" @@ -3762,6 +3807,21 @@ files = [ [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "wcmatch" +version = "8.4.1" +description = "Wildcard/glob file name matcher." +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "wcmatch-8.4.1-py3-none-any.whl", hash = "sha256:3476cd107aba7b25ba1d59406938a47dc7eec6cfd0ad09ff77193f21a964dee7"}, + {file = "wcmatch-8.4.1.tar.gz", hash = "sha256:b1f042a899ea4c458b7321da1b5e3331e3e0ec781583434de1301946ceadb943"}, +] + +[package.dependencies] +bracex = ">=2.1.1" + [[package]] name = "wcwidth" version = "0.2.5" @@ -3926,7 +3986,8 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [extras] audio = ["pydub"] aws = ["smart-open"] -common = ["lz4", "protobuf"] +common = ["protobuf", "lz4"] +full = ["protobuf", "lz4", "pillow", "types-pillow", "av", "pydub", "trimesh"] hnswlib = ["hnswlib"] image = ["pillow", "types-pillow"] jac = ["jina-hubble-sdk"] @@ -3939,4 +4000,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "0872bd8654de67d349699a227cd2dc1708c6fa5066c84e1f295d630184ee5ac4" +content-hash = "cf8aa752961fc4ba57ac4e938a08dc55283e32f4285799463c84338401bdf4d4" diff --git a/pyproject.toml b/pyproject.toml index 3f47e4d9a13..4af671846b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ pytest-asyncio = ">=0.20.2" [tool.poetry.group.docs.dependencies] mkdocstrings = {extras = ["python"], version = ">=0.20.0"} mkdocs-material= ">=9.1.2" - 
+mkdocs-awesome-pages-plugin = ">=2.8.0" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" From 180a58ad772cc294f2e6c948884c410af2239573 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:14:34 +0200 Subject: [PATCH 10/22] docs: add install Signed-off-by: samsja --- docs/user_guide/intro.md | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/user_guide/intro.md b/docs/user_guide/intro.md index 3710be483f7..ee8c20f2b1a 100644 --- a/docs/user_guide/intro.md +++ b/docs/user_guide/intro.md @@ -2,21 +2,41 @@ This user guide show you how to use `DocArray` with most of its features, step by step. +You wil first need to install `DocArray` in you python environment. ## Install DocArray To install `DocArray` to follow this user guide, you can use the following command: ```console -$ pip install "docarray[full]" - ----> 100% +pip install "docarray[full]" ``` -Note: this will install the main dependencies of `DocArray` and will work will all the modalities supported. -T +This will install the main dependencies of `DocArray` and will work will all the modalities supported. + !!! note To install a very light version of `DocArray` with only the core dependencies, you can use the following command: ``` - pip install "docarray[full]" - ``` \ No newline at end of file + pip install "docarray" + ``` + + If you want to install user protobuf with the minimal dependencies you can do + + ``` + pip install "docarray[common]" + ``` + +!!! note + You can always only install a subset of the dependencies for the modalities that you need. 
+ For instance lets say you only want to work with images, you can do + + ``` + pip install "docarray[image]" + ``` + + or with image and audio + + + ``` + pip install "docarray[image, audio]" + ``` From f058f22e31de5a69ed2e250aae9eb6d5f3ef8b30 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:17:51 +0200 Subject: [PATCH 11/22] docs: rename tutorials to how to Signed-off-by: samsja --- docs/{tutorials => how_to}/add_doc_index.md | 0 .../multimodal_training_and_serving.md | 0 .../optimize_performance_with_id_generation.md | 0 mkdocs.yml | 10 +++++----- 4 files changed, 5 insertions(+), 5 deletions(-) rename docs/{tutorials => how_to}/add_doc_index.md (100%) rename docs/{tutorials => how_to}/multimodal_training_and_serving.md (100%) rename docs/{tutorials => how_to}/optimize_performance_with_id_generation.md (100%) diff --git a/docs/tutorials/add_doc_index.md b/docs/how_to/add_doc_index.md similarity index 100% rename from docs/tutorials/add_doc_index.md rename to docs/how_to/add_doc_index.md diff --git a/docs/tutorials/multimodal_training_and_serving.md b/docs/how_to/multimodal_training_and_serving.md similarity index 100% rename from docs/tutorials/multimodal_training_and_serving.md rename to docs/how_to/multimodal_training_and_serving.md diff --git a/docs/tutorials/optimize_performance_with_id_generation.md b/docs/how_to/optimize_performance_with_id_generation.md similarity index 100% rename from docs/tutorials/optimize_performance_with_id_generation.md rename to docs/how_to/optimize_performance_with_id_generation.md diff --git a/mkdocs.yml b/mkdocs.yml index 32e9519d564..6c9e116240c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -53,11 +53,11 @@ plugins: nav: - Home: README.md - - User Guide: + - Tutorial - User Guide: - user_guide/intro.md - - Tutorials: - - tutorials/add_doc_index.md - - tutorials/multimodal_training_and_serving.md - - tutorials/optimize_performance_with_id_generation.md + - How-to: + - how_to/add_doc_index.md + - 
how_to/multimodal_training_and_serving.md + - how_to/optimize_performance_with_id_generation.md - ... - Contributing: CONTRIBUTING.md From 3afe15844356a5744c78fbaea04bbe0e24ff0a68 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:24:01 +0200 Subject: [PATCH 12/22] chore: add pre commit blacken docs Signed-off-by: samsja --- .pre-commit-config.yaml | 11 ++++++++++- poetry.lock | 17 ++++++++++++++++- pyproject.toml | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a7c5542f3e..bccbe2f206d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,4 +23,13 @@ repos: - repo: https://github.com/charliermarsh/ruff-pre-commit rev: v0.0.243 hooks: - - id: ruff \ No newline at end of file + - id: ruff + +- repo: https://github.com/asottile/blacken-docs + rev: 1.13.0 + hooks: + - id: blacken-docs + args: + - -S + additional_dependencies: + - black==22.3.0 \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index ff4dad6815b..8a45df32aa7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -410,6 +410,21 @@ d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "blacken-docs" +version = "1.13.0" +description = "Run Black on Python code blocks in documentation files." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "blacken_docs-1.13.0-py3-none-any.whl", hash = "sha256:455388df506fca04742f36a4a3475630eb7f141cb98acc6070d3c24bcf69cdda"}, + {file = "blacken_docs-1.13.0.tar.gz", hash = "sha256:2babba84a42fb31a1d393dcf5a9a66d9b0657bdc320aec69d9f96301501dba35"}, +] + +[package.dependencies] +black = ">=22.1.0" + [[package]] name = "bleach" version = "5.0.1" @@ -4000,4 +4015,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "cf8aa752961fc4ba57ac4e938a08dc55283e32f4285799463c84338401bdf4d4" +content-hash = "e2db7830e67ddd737fe6c47eac5ecc746afd2ce47a4cef013182b30e50a3737d" diff --git a/pyproject.toml b/pyproject.toml index 4af671846b7..79dde2568e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ types-protobuf = ">=3.20.4" black = ">=22.10.0" isort = ">=5.10.1" ruff = ">=0.0.243" +blacken-docs = ">=1.13.0" [tool.poetry.group.dev.dependencies] uvicorn = ">=0.19.0" From 6edb371e1fa899ede72f394d87fbdd81fa88efe0 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:24:36 +0200 Subject: [PATCH 13/22] chore: add blacken docs Signed-off-by: samsja --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e06a3b9df21..332daa90ca4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -276,6 +276,7 @@ This allows: If you need to monitor and debug your code, you can enable docarray logging: ```python import logging + logging.getLogger('docarray').setLevel(logging.DEBUG) ``` From 6f5da6ca013ae21c8d5f941e08b08d9f9f6cea4a Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:29:37 +0200 Subject: [PATCH 14/22] docs: arr warning docarray version Signed-off-by: samsja --- docs/user_guide/intro.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/user_guide/intro.md b/docs/user_guide/intro.md index ee8c20f2b1a..cb51945c154 100644 --- 
a/docs/user_guide/intro.md +++ b/docs/user_guide/intro.md @@ -26,17 +26,19 @@ This will install the main dependencies of `DocArray` and will work will all the pip install "docarray[common]" ``` -!!! note - You can always only install a subset of the dependencies for the modalities that you need. - For instance lets say you only want to work with images, you can do +Depending on your usage you might want to only use `DocArray` with only a couple of specific modalities. +For instance lets say you only want to work with images, you can do install `DocArray` using the following command: - ``` - pip install "docarray[image]" - ``` +``` +pip install "docarray[image]" +``` - or with image and audio +or with image and audio - ``` - pip install "docarray[image, audio]" - ``` +``` +pip install "docarray[image, audio]" +``` + +!!! warning + This way of installing `DocArray` is only valid starting with version `0.30` \ No newline at end of file From 8577ce14f6c141e5defdca95d4860f86646cc399 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 12:47:36 +0200 Subject: [PATCH 15/22] docs: repo url Signed-off-by: samsja --- mkdocs.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 6c9e116240c..6df5851459a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,9 @@ -site_name: "DocArray" +site_name: DocArray Documentation +site_description: DocArray, DocArray is a library for representing, sending and storing multi-modal data, with a focus on applications in ML and Neural Search. 
+site_url: https://docs.docarray.org/ +repo_name: docarray/docarray +repo_url: https://github.com/docarray/docarray +edit_uri: '' theme: # logo: assets/favicon.png name: material From e125f386f777d8af414427254f617cb657700423 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:04:08 +0200 Subject: [PATCH 16/22] docs: add social Signed-off-by: samsja --- mkdocs.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 6df5851459a..58314789607 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,8 +16,6 @@ theme: primary: teal toggle: icon: material/brightness-7 - - name: Switch to dark mode # Palette toggle for dark mode @@ -45,6 +43,17 @@ markdown_extensions: - attr_list - md_in_html +extra: +# analytics: +# provider: google + social: + - icon: fontawesome/brands/github-alt + link: https://github.com/docarray/docarray + - icon: fontawesome/brands/discord + link: https://discord.com/invite/WaMp6PVPgR + - icon: fontawesome/brands/twitter + link: https://twitter.com/docarray + plugins: - search From 8f9b0c8f074acfdadb8cb007666cd494743ed95e Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 13:11:01 +0200 Subject: [PATCH 17/22] docs: add logo Signed-off-by: samsja --- docs/assets/logo-dark.svg | 18 +++++++++++++----- docs/assets/logo-light.svg | 18 +++++------------- mkdocs.yml | 7 ++++++- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/docs/assets/logo-dark.svg b/docs/assets/logo-dark.svg index 5c3d8d504a8..db2420c6673 100644 --- a/docs/assets/logo-dark.svg +++ b/docs/assets/logo-dark.svg @@ -1,12 +1,20 @@ - docarray-logo_text-dark - - - + docarray-logo_text-light + + + + - + + + + + + + + \ No newline at end of file diff --git a/docs/assets/logo-light.svg b/docs/assets/logo-light.svg index db2420c6673..5c3d8d504a8 100644 --- a/docs/assets/logo-light.svg +++ b/docs/assets/logo-light.svg @@ -1,20 +1,12 @@ - docarray-logo_text-light - - - - + docarray-logo_text-dark + + + - - - - - 
- - - + \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 58314789607..aa9d1302b53 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,7 +5,9 @@ repo_name: docarray/docarray repo_url: https://github.com/docarray/docarray edit_uri: '' theme: -# logo: assets/favicon.png + logo: assets/logo-light.svg + + favicon: assets/favicon.png name: material features: # - navigation.sections @@ -18,12 +20,15 @@ theme: icon: material/brightness-7 name: Switch to dark mode + # Palette toggle for dark mode - scheme: slate primary: teal toggle: icon: material/brightness-4 name: Switch to light mode +# logo: assets/docarray-light.svg + #primary: teal markdown_extensions: - pymdownx.highlight: From 013567e28ef02cea467a03860099f54101d08b46 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 14:27:17 +0200 Subject: [PATCH 18/22] docs: add first step emtpy page Signed-off-by: samsja --- docs/user_guide/first_step.md | 0 mkdocs.yml | 2 ++ 2 files changed, 2 insertions(+) create mode 100644 docs/user_guide/first_step.md diff --git a/docs/user_guide/first_step.md b/docs/user_guide/first_step.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/mkdocs.yml b/mkdocs.yml index aa9d1302b53..e7749bc2874 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -74,6 +74,8 @@ nav: - Home: README.md - Tutorial - User Guide: - user_guide/intro.md + - user_guide/first_step.md + - How-to: - how_to/add_doc_index.md - how_to/multimodal_training_and_serving.md From 29c12a20f3342b225dc5641ad6707721231669f3 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 15:52:03 +0200 Subject: [PATCH 19/22] docs: add document docs Signed-off-by: samsja --- docs/user_guide/first_step.md | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/docs/user_guide/first_step.md b/docs/user_guide/first_step.md index e69de29bb2d..85d7a807edc 100644 --- a/docs/user_guide/first_step.md +++ b/docs/user_guide/first_step.md @@ -0,0 +1,72 @@ +# First Step : BaseDoc + 
+At the heart of `DocArray` lies the concept of [`BaseDoc`][docarray.base_doc.doc.BaseDoc]. + +A [BaseDoc][docarray.base_doc.doc.BaseDoc] is very similar to [Pydantic](https://docs.pydantic.dev/) +[`BaseModel`](https://docs.pydantic.dev/usage/models). It allows to define custom `Document` schema (or `Model` in +the Pydantic world) to represent your data. + +## Basic `Doc` usage. + +Before going in detail about what we can do with [BaseDoc][docarray.base_doc.doc.BaseDoc] and how to use it, let's +take a look at how it looks like in practice. + +The following python code will define a `BannerDoc` class that will be used to represent banner data. + +```python +from docarray import BaseDoc +from docarray.typing import ImageUrl + + +class BannerDoc(BaseDoc): + img_url: ImageUrl + title: str + description: str +``` + +you can then instantiate a `BannerDoc` object and access its attributes. + +```python +banner = BannerDoc( + image_url="https://example.com/image.png", + title="Hello World", + description="This is a banner", +) + +assert banner.img_url == "https://example.com/image.png" +assert banner.title == "Hello World" +assert banner.description == "This is a banner" +``` + +## `BaseDoc` allows to represent MultiModal and nested Data. + +more complex example + + +## `BaseDoc` is a Pydantic `BaseModel` + +The class [BaseDoc][docarray.base_doc.doc.BaseDoc] inherits from pydantic [BaseModel](https://docs.pydantic.dev/usage/models) from Pydantic. So you can use +all the features of `BaseModel` in your `Doc` class. + +This namely means that `BaseDoc`: + +* Will perform data validation: `BaseDoc` will check that the data you pass to it is valid. If not, it will raise an + error. 
Data being "valid" is actually define by the type use in the docstring itself, but we will come back on this concept later (TODO add typing section) + +* Can be configured using a nested `Config` class, see pydantic [documentation](https://docs.pydantic.dev/usage/model_config/) for more details on what kind of config Pydantic offer. + +* Can be used as a drop in replacement for `BaseModel` in your code and is compatible with tools using Pydantic like [FastAPI]('https://fastapi.tiangolo.com/'). + + +### What is the difference with Pydantic `BaseModel`? (INCOMPLETE) + +here maybe need the link to the versus section + +[BaseDoc][docarray.base_doc.doc.BaseDoc] is not only a [BaseModel](https://docs.pydantic.dev/usage/models), + +* it allows to be used with DocArray [Typed](docarray.typing) that are oriented toward MultiModal (image, audio, ...) data and for +Machine Learning use case TODO link the type section. + +Another tiny difference is that [BaseDoc][docarray.base_doc.doc.BaseDoc] has a generated by default `id` field that is used to uniquely identify a document. 
+ + From 1ba22425a0a2f2c1e53896b5c83779d22b575984 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 27 Mar 2023 16:32:27 +0200 Subject: [PATCH 20/22] feat: add markdown documentation test Signed-off-by: samsja --- .github/workflows/ci.yml | 5 ++--- docs/user_guide/first_step.md | 4 ++-- poetry.lock | 17 ++++++++++++++++- pyproject.toml | 1 + tests/docs/test_docs.py | 13 +++++++++++++ 5 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 tests/docs/test_docs.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9715db93d72..d289509f5d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,7 +100,7 @@ jobs: matrix: python-version: [3.7] # test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}} - test-path: [tests/integrations, tests/units] + test-path: [tests/integrations, tests/units, tests/documentation] steps: - uses: actions/checkout@v2.5.0 - name: Set up Python ${{ matrix.python-version }} @@ -144,7 +144,7 @@ jobs: fail-fast: false matrix: python-version: [3.7] - test-path: [tests/integrations, tests/units] + test-path: [tests/integrations, tests/units, tests/documentation] steps: - uses: actions/checkout@v2.5.0 - name: Set up Python ${{ matrix.python-version }} @@ -275,7 +275,6 @@ jobs: poetry run pytest -m 'benchmark' tests timeout-minutes: 30 - # just for blocking the merge until all parallel core-test are successful success-all-test: needs: [docarray-test, docarray-test-proto3, docarray-doc-index, docarray-test-tensorflow, docarray-test-benchmarks, import-test, check-black, check-mypy, lint-ruff] diff --git a/docs/user_guide/first_step.md b/docs/user_guide/first_step.md index 85d7a807edc..f4850f11489 100644 --- a/docs/user_guide/first_step.md +++ b/docs/user_guide/first_step.md @@ -19,7 +19,7 @@ from docarray.typing import ImageUrl class BannerDoc(BaseDoc): - img_url: ImageUrl + image_url: ImageUrl title: str description: str ``` @@ -33,7 +33,7 @@ banner = BannerDoc( description="This is a banner", 
) -assert banner.img_url == "https://example.com/image.png" +assert banner.image_url == "https://example.com/image.png" assert banner.title == "Hello World" assert banner.description == "This is a banner" ``` diff --git a/poetry.lock b/poetry.lock index 8a45df32aa7..2f89d1231dc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1779,6 +1779,21 @@ files = [ griffe = ">=0.24" mkdocstrings = ">=0.19" +[[package]] +name = "mktestdocs" +version = "0.2.0" +description = "" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "mktestdocs-0.2.0-py2.py3-none-any.whl", hash = "sha256:0ce2ba702dfe7f2a516878fd1787d4c1f95e4a088248893bb5788ac037010559"}, + {file = "mktestdocs-0.2.0.tar.gz", hash = "sha256:a6b401c63ac02ab683443e0fcb27c58fc8c2264cf4e9e93835741d234f917267"}, +] + +[package.extras] +test = ["pytest (>=4.0.2)"] + [[package]] name = "multidict" version = "6.0.4" @@ -4015,4 +4030,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "e2db7830e67ddd737fe6c47eac5ecc746afd2ce47a4cef013182b30e50a3737d" +content-hash = "c5b13c9b48aa9edf9d494ce8ba91cfdd9f78d4220ae758ac8a74b69963fc7253" diff --git a/pyproject.toml b/pyproject.toml index 79dde2568e7..9e30c3a804e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ pytest-asyncio = ">=0.20.2" mkdocstrings = {extras = ["python"], version = ">=0.20.0"} mkdocs-material= ">=9.1.2" mkdocs-awesome-pages-plugin = ">=2.8.0" +mktestdocs= ">=0.2.0" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/tests/docs/test_docs.py b/tests/docs/test_docs.py new file mode 100644 index 00000000000..4eceb252f89 --- /dev/null +++ b/tests/docs/test_docs.py @@ -0,0 +1,13 @@ +import pathlib + +import pytest +from mktestdocs import check_md_file + + +# @pytest.mark.parametrize('fpath', pathlib.Path("docs").glob("**/*.md"), ids=str) +# to use later +@pytest.mark.parametrize( + 'fpath', 
pathlib.Path('docs/user_guide').glob('**/*.md'), ids=str +) +def test_files_good(fpath): + check_md_file(fpath=fpath, memory=True) From 4988cb4cf7eaf70c89177d03806bf69a4374c810 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 28 Mar 2023 10:02:50 +0200 Subject: [PATCH 21/22] docs: remove content Signed-off-by: samsja --- docs/user_guide/first_step.md | 71 ----------------------------------- docs/user_guide/intro.md | 43 --------------------- 2 files changed, 114 deletions(-) diff --git a/docs/user_guide/first_step.md b/docs/user_guide/first_step.md index f4850f11489..0671e3a096a 100644 --- a/docs/user_guide/first_step.md +++ b/docs/user_guide/first_step.md @@ -1,72 +1 @@ # First Step : BaseDoc - -At the heart of `DocArray` lies the concept of [`BaseDoc`][docarray.base_doc.doc.BaseDoc]. - -A [BaseDoc][docarray.base_doc.doc.BaseDoc] is very similar to [Pydantic](https://docs.pydantic.dev/) -[`BaseModel`](https://docs.pydantic.dev/usage/models). It allows to define custom `Document` schema (or `Model` in -the Pydantic world) to represent your data. - -## Basic `Doc` usage. - -Before going in detail about what we can do with [BaseDoc][docarray.base_doc.doc.BaseDoc] and how to use it, let's -take a look at how it looks like in practice. - -The following python code will define a `BannerDoc` class that will be used to represent banner data. - -```python -from docarray import BaseDoc -from docarray.typing import ImageUrl - - -class BannerDoc(BaseDoc): - image_url: ImageUrl - title: str - description: str -``` - -you can then instantiate a `BannerDoc` object and access its attributes. - -```python -banner = BannerDoc( - image_url="https://example.com/image.png", - title="Hello World", - description="This is a banner", -) - -assert banner.image_url == "https://example.com/image.png" -assert banner.title == "Hello World" -assert banner.description == "This is a banner" -``` - -## `BaseDoc` allows to represent MultiModal and nested Data. 
- -more complex example - - -## `BaseDoc` is a Pydantic `BaseModel` - -The class [BaseDoc][docarray.base_doc.doc.BaseDoc] inherits from pydantic [BaseModel](https://docs.pydantic.dev/usage/models) from Pydantic. So you can use -all the features of `BaseModel` in your `Doc` class. - -This namely means that `BaseDoc`: - -* Will perform data validation: `BaseDoc` will check that the data you pass to it is valid. If not, it will raise an - error. Data being "valid" is actually define by the type use in the docstring itself, but we will come back on this concept later (TODO add typing section) - -* Can be configured using a nested `Config` class, see pydantic [documentation](https://docs.pydantic.dev/usage/model_config/) for more details on what kind of config Pydantic offer. - -* Can be used as a drop in replacement for `BaseModel` in your code and is compatible with tools using Pydantic like [FastAPI]('https://fastapi.tiangolo.com/'). - - -### What is the difference with Pydantic `BaseModel`? (INCOMPLETE) - -here maybe need the link to the versus section - -[BaseDoc][docarray.base_doc.doc.BaseDoc] is not only a [BaseModel](https://docs.pydantic.dev/usage/models), - -* it allows to be used with DocArray [Typed](docarray.typing) that are oriented toward MultiModal (image, audio, ...) data and for -Machine Learning use case TODO link the type section. - -Another tiny difference is that [BaseDoc][docarray.base_doc.doc.BaseDoc] has a generated by default `id` field that is used to uniquely identify a document. - - diff --git a/docs/user_guide/intro.md b/docs/user_guide/intro.md index cb51945c154..c500c92629f 100644 --- a/docs/user_guide/intro.md +++ b/docs/user_guide/intro.md @@ -1,44 +1 @@ # User Guide - Intro - -This user guide show you how to use `DocArray` with most of its features, step by step. - -You wil first need to install `DocArray` in you python environment. 
-## Install DocArray - -To install `DocArray` to follow this user guide, you can use the following command: - -```console -pip install "docarray[full]" -``` - -This will install the main dependencies of `DocArray` and will work will all the modalities supported. - - -!!! note - To install a very light version of `DocArray` with only the core dependencies, you can use the following command: - ``` - pip install "docarray" - ``` - - If you want to install user protobuf with the minimal dependencies you can do - - ``` - pip install "docarray[common]" - ``` - -Depending on your usage you might want to only use `DocArray` with only a couple of specific modalities. -For instance lets say you only want to work with images, you can do install `DocArray` using the following command: - -``` -pip install "docarray[image]" -``` - -or with image and audio - - -``` -pip install "docarray[image, audio]" -``` - -!!! warning - This way of installing `DocArray` is only valid starting with version `0.30` \ No newline at end of file From 60e2016d9686dac187bdfe263a0cf5ccf240b9a8 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 28 Mar 2023 10:20:15 +0200 Subject: [PATCH 22/22] docs: fix ci Signed-off-by: samsja --- tests/{docs => documentation}/test_docs.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{docs => documentation}/test_docs.py (100%) diff --git a/tests/docs/test_docs.py b/tests/documentation/test_docs.py similarity index 100% rename from tests/docs/test_docs.py rename to tests/documentation/test_docs.py